642 files changed, 29810 insertions, 12145 deletions
diff --git a/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCU.svg b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCU.svg
new file mode 100644
index 000000000000..727e270b11e4
--- /dev/null
+++ b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCU.svg
@@ -0,0 +1,474 @@
[fig2dev-generated SVG, 474 lines. Diagram labels: a "struct rcu_state" box
 enclosing a root "struct rcu_node" and two lower-level "struct rcu_node"
 boxes that point to the root; "struct rcu_data" boxes for CPU 0, CPU 15,
 CPU 1007, and CPU 1023 reference the leaf rcu_node structures.]

diff --git a/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBH.svg b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBH.svg
new file mode 100644
index 000000000000..9bbb1944f962
--- /dev/null
+++ b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBH.svg
@@ -0,0 +1,499 @@
[fig2dev-generated SVG, 499 lines. Diagram labels: two overlapping
 "struct rcu_state" trees tagged rcu_sched and rcu_bh, each with a root
 "struct rcu_node" and two leaf "struct rcu_node" boxes, plus per-CPU
 "struct rcu_data" boxes referencing the leaf nodes.]

diff --git a/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBHdyntick.svg b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBHdyntick.svg
new file mode 100644
index 000000000000..21ba7823479d
--- /dev/null
+++ b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBHdyntick.svg
@@ -0,0 +1,695 @@
[fig2dev-generated SVG, 695 lines. Same rcu_sched/rcu_bh layout as the
 previous figure, with the addition of per-CPU "struct rcu_dynticks" boxes
 referenced by the "struct rcu_data" boxes of both flavors.]

diff --git a/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntick.svg b/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntick.svg
new file mode 100644
index 000000000000..15adcac036c7
--- /dev/null
+++ b/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntick.svg
@@ -0,0 +1,741 @@
[fig2dev-generated SVG, 741 lines. Diagram labels: three overlapping
 "struct rcu_state" trees tagged rcu_preempt, rcu_sched, and rcu_bh, each
 with root and leaf "struct rcu_node" boxes and per-CPU "struct rcu_data"
 boxes, all sharing per-CPU "struct rcu_dynticks" boxes.]

diff --git a/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntickCB.svg b/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntickCB.svg
new file mode 100644
index 000000000000..bbc3801470d0
--- /dev/null
+++ b/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntickCB.svg
@@ -0,0 +1,858 @@
[fig2dev-generated SVG, 858 lines; only its opening portion appears in this
 excerpt. The visible labels match the previous figure: rcu_preempt,
 rcu_sched, and rcu_bh "struct rcu_state" trees with "struct rcu_node",
 per-CPU "struct rcu_data", and "struct rcu_dynticks" boxes.]
+ id="polyline82" /> + <!-- Arrowhead on XXXpoint 6000 6300 - 8146 7986--> + <!-- Circle --> + <circle + cx="2850" + cy="4350" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle86" /> + <!-- Circle --> + <circle + cx="3150" + cy="4350" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle88" /> + <!-- Circle --> + <circle + cx="3450" + cy="4350" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle90" /> + <!-- Circle --> + <circle + cx="1350" + cy="5550" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle92" /> + <!-- Circle --> + <circle + cx="1650" + cy="5550" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle94" /> + <!-- Circle --> + <circle + cx="1950" + cy="5550" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle96" /> + <!-- Circle --> + <circle + cx="4350" + cy="5550" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle98" /> + <!-- Circle --> + <circle + cx="4650" + cy="5550" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle100" /> + <!-- Circle --> + <circle + cx="4950" + cy="5550" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle102" /> + <!-- Line: box --> + <rect + x="7350" + y="7950" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect104" /> + <!-- Line: box --> + <rect + x="7350" + y="9450" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect106" /> + <!-- Line --> + <polyline + points="8100,8850 8100,9384 " + style="stroke:#000000;stroke-width:30;stroke-linejoin:miter;stroke-linecap:butt;marker-end:url(#Arrow1Mend)" + id="polyline108" /> + <!-- Arrowhead on XXXpoint 8100 8850 - 8100 9510--> + <!-- Line: box --> + <rect + x="7350" + y="10950" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect112" /> + <!-- Line --> + <polyline + points="8100,10350 8100,10884 " + style="stroke:#000000;stroke-width:30;stroke-linejoin:miter;stroke-linecap:butt;marker-end:url(#Arrow1Mend)" + id="polyline114" /> + <!-- Arrowhead on XXXpoint 8100 10350 - 8100 11010--> + <!-- Line: box --> + <rect + x="750" + y="3900" + width="1800" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect118" /> + <!-- Line: box --> + <rect + x="300" + y="7050" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect120" /> + <!-- Line: box --> + <rect + x="3750" + y="3900" + width="1800" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect122" /> + <!-- Line: box --> + <rect + x="4500" + y="5850" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect124" /> + <!-- Line: box --> + <rect + x="3300" + y="7050" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect126" /> + <!-- Line: box --> + <rect + x="2250" + y="2100" + width="1800" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; 
stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect128" /> + <!-- Line: box --> + <rect + x="0" + y="9750" + width="2100" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; " + id="rect130" /> + <!-- Line: box --> + <rect + x="1350" + y="8550" + width="2100" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; " + id="rect132" /> + <!-- Line: box --> + <rect + x="3000" + y="9750" + width="2100" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; " + id="rect134" /> + <!-- Line: box --> + <rect + x="4350" + y="8550" + width="2100" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; " + id="rect136" /> + <!-- Line: box --> + <rect + x="1500" + y="5850" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect138" /> + <!-- Text --> + <text + xml:space="preserve" + x="8100" + y="8250" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text140">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="8100" + y="8550" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text142">rcu_head</text> + <!-- Text --> + <text + xml:space="preserve" + x="8100" + y="9750" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text144">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="8100" + y="10050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text146">rcu_head</text> + <!-- Text --> + <text + xml:space="preserve" + x="8100" + y="11250" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text148">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="8100" + y="11550" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text150">rcu_head</text> + <!-- Text --> + <text + xml:space="preserve" + x="6000" + y="1200" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="192" + text-anchor="end" + id="text152">rcu_sched</text> + <!-- Text --> + <text + xml:space="preserve" + x="6450" + y="750" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="192" + text-anchor="end" + id="text154">rcu_bh</text> + <!-- Text --> + <text + xml:space="preserve" + x="3150" + y="2400" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text156">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="3150" + y="2700" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text158">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="1650" + y="4200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + 
text-anchor="middle" + id="text160">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="1650" + y="4500" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text162">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="4650" + y="4500" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text164">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="4650" + y="4200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text166">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="2250" + y="6150" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text168">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="2250" + y="6450" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text170">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="1050" + y="7350" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text172">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="1050" + y="7650" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text174">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="5250" + y="6150" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text176">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="5250" + y="6450" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text178">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="4050" + y="7350" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text180">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="4050" + y="7650" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text182">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="450" + y="1800" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="start" + id="text184">struct rcu_state</text> + <!-- Text --> + <text + xml:space="preserve" + x="1050" + y="10050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text186">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="1050" + y="10350" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text188">rcu_dynticks</text> + <!-- Text --> + <text + xml:space="preserve" + x="4050" + y="10050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text190">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="4050" + y="10350" + fill="#000000" + font-family="Courier" + 
font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text192">rcu_dynticks</text> + <!-- Text --> + <text + xml:space="preserve" + x="2400" + y="8850" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text194">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="2400" + y="9150" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text196">rcu_dynticks</text> + <!-- Text --> + <text + xml:space="preserve" + x="5400" + y="8850" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text198">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="5400" + y="9150" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text200">rcu_dynticks</text> + <!-- Text --> + <text + xml:space="preserve" + x="6900" + y="300" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="192" + text-anchor="end" + id="text202">rcu_preempt</text> + <!-- Line --> + <polyline + points="5250,5850 5250,4864 " + style="stroke:#00d1d1;stroke-width:29.99463964;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline204" /> + <!-- Arrowhead on XXXpoint 5250 5850 - 5250 4740--> + </g> +</svg> diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.html b/Documentation/RCU/Design/Data-Structures/Data-Structures.html new file mode 100644 index 000000000000..7eb47ac25ad7 --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.html @@ -0,0 +1,1333 @@ +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" + "http://www.w3.org/TR/html4/loose.dtd"> + <html> + <head><title>A Tour Through TREE_RCU's Data Structures [LWN.net]</title> + <meta HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=iso-8859-1"> + + <p>January 27, 2016</p> + <p>This article was contributed by Paul E. McKenney</p> + +<h3>Introduction</h3> + +This document describes RCU's major data structures and their relationship +to each other. + +<ol> +<li> <a href="#Data-Structure Relationships"> + Data-Structure Relationships</a> +<li> <a href="#The rcu_state Structure"> + The <tt>rcu_state</tt> Structure</a> +<li> <a href="#The rcu_node Structure"> + The <tt>rcu_node</tt> Structure</a> +<li> <a href="#The rcu_data Structure"> + The <tt>rcu_data</tt> Structure</a> +<li> <a href="#The rcu_dynticks Structure"> + The <tt>rcu_dynticks</tt> Structure</a> +<li> <a href="#The rcu_head Structure"> + The <tt>rcu_head</tt> Structure</a> +<li> <a href="#RCU-Specific Fields in the task_struct Structure"> + RCU-Specific Fields in the <tt>task_struct</tt> Structure</a> +<li> <a href="#Accessor Functions"> + Accessor Functions</a> +</ol> + +At the end we have the +<a href="#Answers to Quick Quizzes">answers to the quick quizzes</a>. + +<h3><a name="Data-Structure Relationships">Data-Structure Relationships</a></h3> + +<p>RCU is for all intents and purposes a large state machine, and its +data structures maintain the state in such a way as to allow RCU readers +to execute extremely quickly, while also processing the RCU grace periods +requested by updaters in an efficient and extremely scalable fashion. 
+The efficiency and scalability of RCU updaters is provided primarily
+by a combining tree, as shown below:
+
+</p><p><img src="BigTreeClassicRCU.svg" alt="BigTreeClassicRCU.svg" width="30%">
+
+</p><p>This diagram shows an enclosing <tt>rcu_state</tt> structure
+containing a tree of <tt>rcu_node</tt> structures.
+Each leaf node of the <tt>rcu_node</tt> tree has up to 16
+<tt>rcu_data</tt> structures associated with it, so that there
+are <tt>NR_CPUS</tt> <tt>rcu_data</tt> structures in all,
+one for each possible CPU.
+This structure is adjusted at boot time, if needed, to handle the
+common case where <tt>nr_cpu_ids</tt> is much less than
+<tt>NR_CPUS</tt>.
+For example, a number of Linux distributions set <tt>NR_CPUS=4096</tt>,
+which results in a three-level <tt>rcu_node</tt> tree.
+If the actual hardware has only 16 CPUs, RCU will adjust itself
+at boot time, resulting in an <tt>rcu_node</tt> tree with only a single node.
+
+</p><p>The purpose of this combining tree is to allow per-CPU events
+such as quiescent states, dyntick-idle transitions,
+and CPU hotplug operations to be processed efficiently
+and scalably.
+Quiescent states are recorded by the per-CPU <tt>rcu_data</tt> structures,
+and other events are recorded by the leaf-level <tt>rcu_node</tt>
+structures.
+All of these events are combined at each level of the tree until finally
+grace periods are completed at the tree's root <tt>rcu_node</tt>
+structure.
+A grace period can be completed at the root once every CPU
+(or, in the case of <tt>CONFIG_PREEMPT_RCU</tt>, task)
+has passed through a quiescent state.
+Once a grace period has completed, record of that fact is propagated
+back down the tree.
+
+</p><p>As can be seen from the diagram, on a 64-bit system
+a two-level tree with 64 leaves can accommodate 1,024 CPUs, with a fanout
+of 64 at the root and a fanout of 16 at the leaves.
+
+<table>
+<tr><th> </th></tr>
+<tr><th align="left">Quick Quiz:</th></tr>
+<tr><td>
+	Why isn't the fanout at the leaves also 64?
+</td></tr>
+<tr><th align="left">Answer:</th></tr>
+<tr><td bgcolor="#ffffff"><font color="ffffff">
+	Because there are more types of events that affect the leaf-level
+	<tt>rcu_node</tt> structures than further up the tree.
+	Therefore, if the leaf <tt>rcu_node</tt> structures have a fanout of
+	64, the contention on these structures' <tt>->lock</tt> fields
+	becomes excessive.
+	Experimentation on a wide variety of systems has shown that a fanout
+	of 16 works well for the leaves of the <tt>rcu_node</tt> tree.
+	</font>
+
+	<p><font color="ffffff">Of course, further experience with
+	systems having hundreds or thousands of CPUs may demonstrate
+	that the fanout for the non-leaf <tt>rcu_node</tt> structures
+	must also be reduced.
+	Such reduction can be easily carried out when and if it proves
+	necessary.
+	In the meantime, if you are using such a system and running into
+	contention problems on the non-leaf <tt>rcu_node</tt> structures,
+	you may use the <tt>CONFIG_RCU_FANOUT</tt> kernel configuration
+	parameter to reduce the non-leaf fanout as needed.
+	</font>
+
+	<p><font color="ffffff">Kernels built for systems with
+	strong NUMA characteristics might also need to adjust
+	<tt>CONFIG_RCU_FANOUT</tt> so that the domains of the
+	<tt>rcu_node</tt> structures align with hardware boundaries.
+	However, there has thus far been no need for this.
+</font></td></tr> +<tr><td> </td></tr> +</table> + +<p>If your system has more than 1,024 CPUs (or more than 512 CPUs on +a 32-bit system), then RCU will automatically add more levels to the +tree. +For example, if you are crazy enough to build a 64-bit system with 65,536 +CPUs, RCU would configure the <tt>rcu_node</tt> tree as follows: + +</p><p><img src="HugeTreeClassicRCU.svg" alt="HugeTreeClassicRCU.svg" width="50%"> + +</p><p>RCU currently permits up to a four-level tree, which on a 64-bit system +accommodates up to 4,194,304 CPUs, though only a mere 524,288 CPUs for +32-bit systems. +On the other hand, you can set <tt>CONFIG_RCU_FANOUT</tt> to be +as small as 2 if you wish, which would permit only 16 CPUs, which +is useful for testing. + +</p><p>This multi-level combining tree allows us to get most of the +performance and scalability +benefits of partitioning, even though RCU grace-period detection is +inherently a global operation. +The trick here is that only the last CPU to report a quiescent state +into a given <tt>rcu_node</tt> structure need advance to the <tt>rcu_node</tt> +structure at the next level up the tree. +This means that at the leaf-level <tt>rcu_node</tt> structure, only +one access out of sixteen will progress up the tree. +For the internal <tt>rcu_node</tt> structures, the situation is even +more extreme: Only one access out of sixty-four will progress up +the tree. +Because the vast majority of the CPUs do not progress up the tree, +the lock contention remains roughly constant up the tree. +No matter how many CPUs there are in the system, at most 64 quiescent-state +reports per grace period will progress all the way to the root +<tt>rcu_node</tt> structure, thus ensuring that the lock contention +on that root <tt>rcu_node</tt> structure remains acceptably low. + +</p><p>In effect, the combining tree acts like a big shock absorber, +keeping lock contention under control at all tree levels regardless +of the level of loading on the system. + +</p><p>The Linux kernel actually supports multiple flavors of RCU +running concurrently, so RCU builds separate data structures for each +flavor. +For example, for <tt>CONFIG_TREE_RCU=y</tt> kernels, RCU provides +rcu_sched and rcu_bh, as shown below: + +</p><p><img src="BigTreeClassicRCUBH.svg" alt="BigTreeClassicRCUBH.svg" width="33%"> + +</p><p>Energy efficiency is increasingly important, and for that +reason the Linux kernel provides <tt>CONFIG_NO_HZ_IDLE</tt>, which +turns off the scheduling-clock interrupts on idle CPUs, which in +turn allows those CPUs to attain deeper sleep states and to consume +less energy. +CPUs whose scheduling-clock interrupts have been turned off are +said to be in <i>dyntick-idle mode</i>. +RCU must handle dyntick-idle CPUs specially +because RCU would otherwise wake up each CPU on every grace period, +which would defeat the whole purpose of <tt>CONFIG_NO_HZ_IDLE</tt>. +RCU uses the <tt>rcu_dynticks</tt> structure to track +which CPUs are in dyntick idle mode, as shown below: + +</p><p><img src="BigTreeClassicRCUBHdyntick.svg" alt="BigTreeClassicRCUBHdyntick.svg" width="33%"> + +</p><p>However, if a CPU is in dyntick-idle mode, it is in that mode +for all flavors of RCU. +Therefore, a single <tt>rcu_dynticks</tt> structure is allocated per +CPU, and all of a given CPU's <tt>rcu_data</tt> structures share +that <tt>rcu_dynticks</tt>, as shown in the figure. 
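+
+</p><p>To make this combining-tree behavior more concrete, the following
+sketch (illustrative only, <i>not</i> the kernel's implementation, and
+using invented names such as <tt>toy_node</tt> and
+<tt>toy_report_qs()</tt>) shows the general idea:
+each node tracks which of its children still owe a quiescent-state
+report, and only the child that clears the last remaining bit continues
+on to that node's parent:
+
+<pre>
+struct toy_node {
+	struct toy_node *parent;   /* NULL at the root. */
+	unsigned long qsmask;      /* Children still owing a report. */
+	unsigned long grpmask;     /* This node's bit in parent's qsmask. */
+};
+
+/* Report a quiescent state on behalf of the child in childmask. */
+static void toy_report_qs(struct toy_node *np, unsigned long childmask)
+{
+	while (np) {
+		np->qsmask &= ~childmask; /* The kernel does this under ->lock. */
+		if (np->qsmask)
+			return;           /* Other children still pending. */
+		childmask = np->grpmask;  /* Last reporter: move to the parent. */
+		np = np->parent;
+	}
+	/* All reports have reached the root: the grace period may end. */
+}
+</pre>
+
+<p>With a leaf fanout of 16, at most one report in sixteen makes it past
+the leaf level, which is the shock-absorber behavior described above.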
+
+</p><p>Kernels built with <tt>CONFIG_PREEMPT_RCU</tt> support
+rcu_preempt in addition to rcu_sched and rcu_bh, as shown below:
+
+</p><p><img src="BigTreePreemptRCUBHdyntick.svg" alt="BigTreePreemptRCUBHdyntick.svg" width="35%">
+
+</p><p>RCU updaters wait for normal grace periods by registering
+RCU callbacks, either directly via <tt>call_rcu()</tt> and
+friends (namely <tt>call_rcu_bh()</tt> and <tt>call_rcu_sched()</tt>,
+there being a separate interface per flavor of RCU)
+or indirectly via <tt>synchronize_rcu()</tt> and friends.
+RCU callbacks are represented by <tt>rcu_head</tt> structures,
+which are queued on <tt>rcu_data</tt> structures while they are
+waiting for a grace period to elapse, as shown in the following figure:
+
+</p><p><img src="BigTreePreemptRCUBHdyntickCB.svg" alt="BigTreePreemptRCUBHdyntickCB.svg" width="40%">
+
+</p><p>This figure shows how <tt>TREE_RCU</tt>'s and
+<tt>PREEMPT_RCU</tt>'s major data structures are related.
+Lesser data structures will be introduced with the algorithms that
+make use of them.
+
+</p><p>Note that each of the data structures in the above figure has
+its own synchronization:
+
+<p><ol>
+<li>	Each <tt>rcu_state</tt> structure has a lock and a mutex,
+	and some fields are protected by the corresponding root
+	<tt>rcu_node</tt> structure's lock.
+<li>	Each <tt>rcu_node</tt> structure has a spinlock.
+<li>	The fields in <tt>rcu_data</tt> are private to the corresponding
+	CPU, although a few can be read and written by other CPUs.
+<li>	Similarly, the fields in <tt>rcu_dynticks</tt> are private
+	to the corresponding CPU, although a few can be read by
+	other CPUs.
+</ol>
+
+<p>It is important to note that different data structures can have
+very different ideas about the state of RCU at any given time.
+For but one example, awareness of the start or end of a given RCU
+grace period propagates slowly through the data structures.
+This slow propagation is absolutely necessary for RCU to have good
+read-side performance.
+If this balkanized implementation seems foreign to you, one useful
+trick is to consider each instance of these data structures to be
+a different person, each having the usual slightly different
+view of reality.
+
+</p><p>The general role of each of these data structures is as
+follows:
+
+</p><ol>
+<li>	<tt>rcu_state</tt>:
+	This structure forms the interconnection between the
+	<tt>rcu_node</tt> and <tt>rcu_data</tt> structures,
+	tracks grace periods, serves as short-term repository
+	for callbacks orphaned by CPU-hotplug events,
+	maintains <tt>rcu_barrier()</tt> state,
+	tracks expedited grace-period state,
+	and maintains state used to force quiescent states when
+	grace periods extend too long.
+<li>	<tt>rcu_node</tt>: This structure forms the combining
+	tree that propagates quiescent-state
+	information from the leaves to the root, and also propagates
+	grace-period information from the root to the leaves.
+	It provides local copies of the grace-period state in order
+	to allow this information to be accessed in a synchronized
+	manner without suffering the scalability limitations that
+	would otherwise be imposed by global locking.
+	In <tt>CONFIG_PREEMPT_RCU</tt> kernels, it manages the lists
+	of tasks that have blocked while in their current
+	RCU read-side critical section.
+	In <tt>CONFIG_PREEMPT_RCU</tt> with
+	<tt>CONFIG_RCU_BOOST</tt>, it manages the
+	per-<tt>rcu_node</tt> priority-boosting
+	kernel threads (kthreads) and state.
+ Finally, it records CPU-hotplug state in order to determine + which CPUs should be ignored during a given grace period. +<li> <tt>rcu_data</tt>: This per-CPU structure is the + focus of quiescent-state detection and RCU callback queuing. + It also tracks its relationship to the corresponding leaf + <tt>rcu_node</tt> structure to allow more-efficient + propagation of quiescent states up the <tt>rcu_node</tt> + combining tree. + Like the <tt>rcu_node</tt> structure, it provides a local + copy of the grace-period information to allow for-free + synchronized + access to this information from the corresponding CPU. + Finally, this structure records past dyntick-idle state + for the corresponding CPU and also tracks statistics. +<li> <tt>rcu_dynticks</tt>: + This per-CPU structure tracks the current dyntick-idle + state for the corresponding CPU. + Unlike the other three structures, the <tt>rcu_dynticks</tt> + structure is not replicated per RCU flavor. +<li> <tt>rcu_head</tt>: + This structure represents RCU callbacks, and is the + only structure allocated and managed by RCU users. + The <tt>rcu_head</tt> structure is normally embedded + within the RCU-protected data structure. +</ol> + +<p>If all you wanted from this article was a general notion of how +RCU's data structures are related, you are done. +Otherwise, each of the following sections give more details on +the <tt>rcu_state</tt>, <tt>rcu_node</tt>, <tt>rcu_data</tt>, +and <tt>rcu_dynticks</tt> data structures. + +<h3><a name="The rcu_state Structure"> +The <tt>rcu_state</tt> Structure</a></h3> + +<p>The <tt>rcu_state</tt> structure is the base structure that +represents a flavor of RCU. +This structure forms the interconnection between the +<tt>rcu_node</tt> and <tt>rcu_data</tt> structures, +tracks grace periods, contains the lock used to +synchronize with CPU-hotplug events, +and maintains state used to force quiescent states when +grace periods extend too long, + +</p><p>A few of the <tt>rcu_state</tt> structure's fields are discussed, +singly and in groups, in the following sections. +The more specialized fields are covered in the discussion of their +use. + +<h5>Relationship to rcu_node and rcu_data Structures</h5> + +This portion of the <tt>rcu_state</tt> structure is declared +as follows: + +<pre> + 1 struct rcu_node node[NUM_RCU_NODES]; + 2 struct rcu_node *level[NUM_RCU_LVLS + 1]; + 3 struct rcu_data __percpu *rda; +</pre> + +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + Wait a minute! + You said that the <tt>rcu_node</tt> structures formed a tree, + but they are declared as a flat array! + What gives? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + The tree is laid out in the array. + The first node In the array is the head, the next set of nodes in the + array are children of the head node, and so on until the last set of + nodes in the array are the leaves. + </font> + + <p><font color="ffffff">See the following diagrams to see how + this works. +</font></td></tr> +<tr><td> </td></tr> +</table> + +<p>The <tt>rcu_node</tt> tree is embedded into the +<tt>->node[]</tt> array as shown in the following figure: + +</p><p><img src="TreeMapping.svg" alt="TreeMapping.svg" width="40%"> + +</p><p>One interesting consequence of this mapping is that a +breadth-first traversal of the tree is implemented as a simple +linear scan of the array, which is in fact what the +<tt>rcu_for_each_node_breadth_first()</tt> macro does. 
+This macro is used at the beginning and ends of grace periods. + +</p><p>Each entry of the <tt>->level</tt> array references +the first <tt>rcu_node</tt> structure on the corresponding level +of the tree, for example, as shown below: + +</p><p><img src="TreeMappingLevel.svg" alt="TreeMappingLevel.svg" width="40%"> + +</p><p>The zero<sup>th</sup> element of the array references the root +<tt>rcu_node</tt> structure, the first element references the +first child of the root <tt>rcu_node</tt>, and finally the second +element references the first leaf <tt>rcu_node</tt> structure. + +</p><p>For whatever it is worth, if you draw the tree to be tree-shaped +rather than array-shaped, it is easy to draw a planar representation: + +</p><p><img src="TreeLevel.svg" alt="TreeLevel.svg" width="60%"> + +</p><p>Finally, the <tt>->rda</tt> field references a per-CPU +pointer to the corresponding CPU's <tt>rcu_data</tt> structure. + +</p><p>All of these fields are constant once initialization is complete, +and therefore need no protection. + +<h5>Grace-Period Tracking</h5> + +<p>This portion of the <tt>rcu_state</tt> structure is declared +as follows: + +<pre> + 1 unsigned long gpnum; + 2 unsigned long completed; +</pre> + +<p>RCU grace periods are numbered, and +the <tt>->gpnum</tt> field contains the number of the grace +period that started most recently. +The <tt>->completed</tt> field contains the number of the +grace period that completed most recently. +If the two fields are equal, the RCU grace period that most recently +started has already completed, and therefore the corresponding +flavor of RCU is idle. +If <tt>->gpnum</tt> is one greater than <tt>->completed</tt>, +then <tt>->gpnum</tt> gives the number of the current RCU +grace period, which has not yet completed. +Any other combination of values indicates that something is broken. +These two fields are protected by the root <tt>rcu_node</tt>'s +<tt>->lock</tt> field. + +</p><p>There are <tt>->gpnum</tt> and <tt>->completed</tt> fields +in the <tt>rcu_node</tt> and <tt>rcu_data</tt> structures +as well. +The fields in the <tt>rcu_state</tt> structure represent the +most current values, and those of the other structures are compared +in order to detect the start of a new grace period in a distributed +fashion. +The values flow from <tt>rcu_state</tt> to <tt>rcu_node</tt> +(down the tree from the root to the leaves) to <tt>rcu_data</tt>. + +<h5>Miscellaneous</h5> + +<p>This portion of the <tt>rcu_state</tt> structure is declared +as follows: + +<pre> + 1 unsigned long gp_max; + 2 char abbr; + 3 char *name; +</pre> + +<p>The <tt>->gp_max</tt> field tracks the duration of the longest +grace period in jiffies. +It is protected by the root <tt>rcu_node</tt>'s <tt>->lock</tt>. + +<p>The <tt>->name</tt> field points to the name of the RCU flavor +(for example, “rcu_sched”), and is constant. +The <tt>->abbr</tt> field contains a one-character abbreviation, +for example, “s” for RCU-sched. + +<h3><a name="The rcu_node Structure"> +The <tt>rcu_node</tt> Structure</a></h3> + +<p>The <tt>rcu_node</tt> structures form the combining +tree that propagates quiescent-state +information from the leaves to the root and also that propagates +grace-period information from the root down to the leaves. +They provides local copies of the grace-period state in order +to allow this information to be accessed in a synchronized +manner without suffering the scalability limitations that +would otherwise be imposed by global locking. 
+In <tt>CONFIG_PREEMPT_RCU</tt> kernels, they manage the lists
+of tasks that have blocked while in their current
+RCU read-side critical section.
+In <tt>CONFIG_PREEMPT_RCU</tt> with
+<tt>CONFIG_RCU_BOOST</tt>, they manage the
+per-<tt>rcu_node</tt> priority-boosting
+kernel threads (kthreads) and state.
+Finally, they record CPU-hotplug state in order to determine
+which CPUs should be ignored during a given grace period.
+
+</p><p>The <tt>rcu_node</tt> structure's fields are discussed,
+singly and in groups, in the following sections.
+
+<h5>Connection to Combining Tree</h5>
+
+<p>This portion of the <tt>rcu_node</tt> structure is declared
+as follows:
+
+<pre>
+  1   struct rcu_node *parent;
+  2   u8 level;
+  3   u8 grpnum;
+  4   unsigned long grpmask;
+  5   int grplo;
+  6   int grphi;
+</pre>
+
+<p>The <tt>->parent</tt> pointer references the <tt>rcu_node</tt>
+one level up in the tree, and is <tt>NULL</tt> for the root
+<tt>rcu_node</tt>.
+The RCU implementation makes heavy use of this field to push quiescent
+states up the tree.
+The <tt>->level</tt> field gives the level in the tree, with
+the root being at level zero, its children at level one, and so on.
+The <tt>->grpnum</tt> field gives this node's position within
+the children of its parent, so this number can range between 0 and 31
+on 32-bit systems and between 0 and 63 on 64-bit systems.
+The <tt>->level</tt> and <tt>->grpnum</tt> fields are
+used only during initialization and for tracing.
+The <tt>->grpmask</tt> field is the bitmask counterpart of
+<tt>->grpnum</tt>, and therefore always has exactly one bit set.
+This mask is used to clear the bit corresponding to this <tt>rcu_node</tt>
+structure in its parent's bitmasks, which are described later.
+Finally, the <tt>->grplo</tt> and <tt>->grphi</tt> fields
+contain the lowest and highest numbered CPU served by this
+<tt>rcu_node</tt> structure, respectively.
+
+</p><p>All of these fields are constant, and thus do not require any
+synchronization.
+
+<h5>Synchronization</h5>
+
+<p>This field of the <tt>rcu_node</tt> structure is declared
+as follows:
+
+<pre>
+  1   raw_spinlock_t lock;
+</pre>
+
+<p>This field is used to protect the remaining fields in this structure,
+unless otherwise stated.
+That said, all of the fields in this structure can be accessed without
+locking for tracing purposes.
+Yes, this can result in confusing traces, but better some tracing confusion
+than to be heisenbugged out of existence.
+
+<h5>Grace-Period Tracking</h5>
+
+<p>This portion of the <tt>rcu_node</tt> structure is declared
+as follows:
+
+<pre>
+  1   unsigned long gpnum;
+  2   unsigned long completed;
+</pre>
+
+<p>These fields are the counterparts of the fields of the same name in
+the <tt>rcu_state</tt> structure.
+They each may lag up to one behind their <tt>rcu_state</tt>
+counterparts.
+If a given <tt>rcu_node</tt> structure's <tt>->gpnum</tt> and
+<tt>->completed</tt> fields are equal, then this <tt>rcu_node</tt>
+structure believes that RCU is idle.
+Otherwise, as with the <tt>rcu_state</tt> structure,
+the <tt>->gpnum</tt> field will be one greater than the
+<tt>->completed</tt> field, with <tt>->gpnum</tt>
+indicating which grace period this <tt>rcu_node</tt> believes
+is still being waited for.
+
+</p><p>The <tt>->gpnum</tt> field of each <tt>rcu_node</tt>
+structure is updated at the beginning
+of each grace period, and its <tt>->completed</tt> field is
+updated at the end of each grace period.
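+
+</p><p>The way these lagging copies are typically used can be sketched
+as follows (illustrative only, with invented names; the hypothetical
+<tt>toy_note_gp_changes()</tt> stands in for the kernel's actual update
+paths): a structure lower in the hierarchy compares its own grace-period
+numbers against those of the structure above it, and any difference
+tells it that a grace period has started or ended since it last looked:
+
+<pre>
+struct toy_gp {
+	unsigned long gpnum;      /* Most recently started grace period. */
+	unsigned long completed;  /* Most recently completed grace period. */
+};
+
+static void toy_note_gp_changes(struct toy_gp *upper, struct toy_gp *local)
+{
+	if (local->completed != upper->completed)
+		local->completed = upper->completed; /* A grace period ended. */
+	if (local->gpnum != upper->gpnum)
+		local->gpnum = upper->gpnum; /* A new grace period started. */
+}
+</pre>
+
+<p>As noted above, if the two local numbers are equal after such an
+update, this structure believes that RCU is idle; if <tt>gpnum</tt> is
+one greater than <tt>completed</tt>, it believes that the corresponding
+grace period is still in progress.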
+
+<h5>Quiescent-State Tracking</h5>
+
+<p>These fields manage the propagation of quiescent states up the
+combining tree.
+
+</p><p>This portion of the <tt>rcu_node</tt> structure has fields
+as follows:
+
+<pre>
+  1   unsigned long qsmask;
+  2   unsigned long expmask;
+  3   unsigned long qsmaskinit;
+  4   unsigned long expmaskinit;
+</pre>
+
+<p>The <tt>->qsmask</tt> field tracks which of this
+<tt>rcu_node</tt> structure's children still need to report
+quiescent states for the current normal grace period.
+Such children will have a value of 1 in their corresponding bit.
+Note that the leaf <tt>rcu_node</tt> structures should be
+thought of as having <tt>rcu_data</tt> structures as their
+children.
+Similarly, the <tt>->expmask</tt> field tracks which
+of this <tt>rcu_node</tt> structure's children still need to report
+quiescent states for the current expedited grace period.
+An expedited grace period has
+the same conceptual properties as a normal grace period, but the
+expedited implementation accepts extreme CPU overhead to obtain
+much lower grace-period latency, for example, consuming a few
+tens of microseconds worth of CPU time to reduce grace-period
+duration from milliseconds to tens of microseconds.
+The <tt>->qsmaskinit</tt> field tracks which of this
+<tt>rcu_node</tt> structure's children cover for at least
+one online CPU.
+This mask is used to initialize <tt>->qsmask</tt>,
+and <tt>->expmaskinit</tt> is used to initialize
+<tt>->expmask</tt>, at the beginning of the
+normal and expedited grace periods, respectively.
+
+<table>
+<tr><th> </th></tr>
+<tr><th align="left">Quick Quiz:</th></tr>
+<tr><td>
+	Why are these bitmasks protected by locking?
+	Come on, haven't you heard of atomic instructions???
+</td></tr>
+<tr><th align="left">Answer:</th></tr>
+<tr><td bgcolor="#ffffff"><font color="ffffff">
+	Lockless grace-period computation!  Such a tantalizing possibility!
+	</font>
+
+	<p><font color="ffffff">But consider the following sequence of events:
+	</font>
+
+	<ol>
+	<li>	<font color="ffffff">CPU&nbsp;0 has been in dyntick-idle
+		mode for quite some time.
+		When it wakes up, it notices that the current RCU
+		grace period needs it to report in, so it sets a
+		flag where the scheduling clock interrupt will find it.
+		</font><p>
+	<li>	<font color="ffffff">Meanwhile, CPU&nbsp;1 is running
+		<tt>force_quiescent_state()</tt>,
+		and notices that CPU&nbsp;0 has been in dyntick-idle mode,
+		which qualifies as an extended quiescent state.
+		</font><p>
+	<li>	<font color="ffffff">CPU&nbsp;0's scheduling clock
+		interrupt fires in the
+		middle of an RCU read-side critical section, and notices
+		that the RCU core needs something, so commences RCU softirq
+		processing.
+		</font>
+		<p>
+	<li>	<font color="ffffff">CPU&nbsp;0's softirq handler
+		executes and is just about ready
+		to report its quiescent state up the <tt>rcu_node</tt>
+		tree.
+		</font><p>
+	<li>	<font color="ffffff">But CPU&nbsp;1 beats it to the punch,
+		completing the current
+		grace period and starting a new one.
+		</font><p>
+	<li>	<font color="ffffff">CPU&nbsp;0 now reports its quiescent
+		state for the wrong
+		grace period.
+		That grace period might now end before the RCU read-side
+		critical section.
+		If that happens, disaster will ensue.
+		</font>
+	</ol>
+
+	<p><font color="ffffff">So the locking is absolutely required in
+	order to coordinate
+	clearing of the bits with the grace-period numbers in
+	<tt>->gpnum</tt> and <tt>->completed</tt>.
+</font></td></tr> +<tr><td> </td></tr> +</table> + +<h5>Blocked-Task Management</h5> + +<p><tt>PREEMPT_RCU</tt> allows tasks to be preempted in the +midst of their RCU read-side critical sections, and these tasks +must be tracked explicitly. +The details of exactly why and how they are tracked will be covered +in a separate article on RCU read-side processing. +For now, it is enough to know that the <tt>rcu_node</tt> +structure tracks them. + +<pre> + 1 struct list_head blkd_tasks; + 2 struct list_head *gp_tasks; + 3 struct list_head *exp_tasks; + 4 bool wait_blkd_tasks; +</pre> + +<p>The <tt>->blkd_tasks</tt> field is a list header for +the list of blocked and preempted tasks. +As tasks undergo context switches within RCU read-side critical +sections, their <tt>task_struct</tt> structures are enqueued +(via the <tt>task_struct</tt>'s <tt>->rcu_node_entry</tt> +field) onto the head of the <tt>->blkd_tasks</tt> list for the +leaf <tt>rcu_node</tt> structure corresponding to the CPU +on which the outgoing context switch executed. +As these tasks later exit their RCU read-side critical sections, +they remove themselves from the list. +This list is therefore in reverse time order, so that if one of the tasks +is blocking the current grace period, all subsequent tasks must +also be blocking that same grace period. +Therefore, a single pointer into this list suffices to track +all tasks blocking a given grace period. +That pointer is stored in <tt>->gp_tasks</tt> for normal +grace periods and in <tt>->exp_tasks</tt> for expedited +grace periods. +These last two fields are <tt>NULL</tt> if either there is +no grace period in flight or if there are no blocked tasks +preventing that grace period from completing. +If either of these two pointers is referencing a task that +removes itself from the <tt>->blkd_tasks</tt> list, +then that task must advance the pointer to the next task on +the list, or set the pointer to <tt>NULL</tt> if there +are no subsequent tasks on the list. + +</p><p>For example, suppose that tasks T1, T2, and T3 are +all hard-affinitied to the largest-numbered CPU in the system. +Then if task T1 blocked in an RCU read-side +critical section, then an expedited grace period started, +then task T2 blocked in an RCU read-side critical section, +then a normal grace period started, and finally task 3 blocked +in an RCU read-side critical section, then the state of the +last leaf <tt>rcu_node</tt> structure's blocked-task list +would be as shown below: + +</p><p><img src="blkd_task.svg" alt="blkd_task.svg" width="60%"> + +</p><p>Task T1 is blocking both grace periods, task T2 is +blocking only the normal grace period, and task T3 is blocking +neither grace period. +Note that these tasks will not remove themselves from this list +immediately upon resuming execution. +They will instead remain on the list until they execute the outermost +<tt>rcu_read_unlock()</tt> that ends their RCU read-side critical +section. + +<p> +The <tt>->wait_blkd_tasks</tt> field indicates whether or not +the current grace period is waiting on a blocked task. 
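+
+</p><p>The pointer-advancing rule described above can be sketched as
+follows (illustrative only, with invented names such as
+<tt>toy_task</tt> and <tt>toy_unblock_task()</tt>; the kernel uses its
+own list primitives): before a task removes itself from the
+blocked-tasks list, any grace-period pointer referencing it must be
+advanced to the next task on the list, or set to <tt>NULL</tt> if there
+is no such task:
+
+<pre>
+struct toy_task {
+	struct toy_task *next;       /* Next task on the blocked-tasks list. */
+};
+
+struct toy_leaf_node {
+	struct toy_task *blkd_tasks; /* Head of the blocked-tasks list. */
+	struct toy_task *gp_tasks;   /* First task blocking the normal GP. */
+	struct toy_task *exp_tasks;  /* First task blocking the expedited GP. */
+};
+
+static void toy_unblock_task(struct toy_leaf_node *rnp, struct toy_task *t)
+{
+	/* Advance any grace-period pointer past the departing task. */
+	if (rnp->gp_tasks == t)
+		rnp->gp_tasks = t->next;
+	if (rnp->exp_tasks == t)
+		rnp->exp_tasks = t->next;
+	/* ... the actual removal of t from the list is omitted here. */
+}
+</pre>
+
+<p>Because the list is in reverse time order, everything from
+<tt>->gp_tasks</tt> (or <tt>->exp_tasks</tt>) to the end of the list is
+blocking the corresponding grace period, so advancing the pointer in
+this way is all that is needed to keep it accurate.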
+ +<h5>Sizing the <tt>rcu_node</tt> Array</h5> + +<p>The <tt>rcu_node</tt> array is sized via a series of +C-preprocessor expressions as follows: + +<pre> + 1 #ifdef CONFIG_RCU_FANOUT + 2 #define RCU_FANOUT CONFIG_RCU_FANOUT + 3 #else + 4 # ifdef CONFIG_64BIT + 5 # define RCU_FANOUT 64 + 6 # else + 7 # define RCU_FANOUT 32 + 8 # endif + 9 #endif +10 +11 #ifdef CONFIG_RCU_FANOUT_LEAF +12 #define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF +13 #else +14 # ifdef CONFIG_64BIT +15 # define RCU_FANOUT_LEAF 64 +16 # else +17 # define RCU_FANOUT_LEAF 32 +18 # endif +19 #endif +20 +21 #define RCU_FANOUT_1 (RCU_FANOUT_LEAF) +22 #define RCU_FANOUT_2 (RCU_FANOUT_1 * RCU_FANOUT) +23 #define RCU_FANOUT_3 (RCU_FANOUT_2 * RCU_FANOUT) +24 #define RCU_FANOUT_4 (RCU_FANOUT_3 * RCU_FANOUT) +25 +26 #if NR_CPUS <= RCU_FANOUT_1 +27 # define RCU_NUM_LVLS 1 +28 # define NUM_RCU_LVL_0 1 +29 # define NUM_RCU_NODES NUM_RCU_LVL_0 +30 # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } +31 # define RCU_NODE_NAME_INIT { "rcu_node_0" } +32 # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } +33 # define RCU_EXP_NAME_INIT { "rcu_node_exp_0" } +34 #elif NR_CPUS <= RCU_FANOUT_2 +35 # define RCU_NUM_LVLS 2 +36 # define NUM_RCU_LVL_0 1 +37 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +38 # define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1) +39 # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } +40 # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } +41 # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } +42 # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" } +43 #elif NR_CPUS <= RCU_FANOUT_3 +44 # define RCU_NUM_LVLS 3 +45 # define NUM_RCU_LVL_0 1 +46 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) +47 # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +48 # define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2) +49 # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } +50 # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } +51 # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } +52 # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" } +53 #elif NR_CPUS <= RCU_FANOUT_4 +54 # define RCU_NUM_LVLS 4 +55 # define NUM_RCU_LVL_0 1 +56 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) +57 # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) +58 # define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +59 # define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) +60 # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } +61 # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } +62 # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } +63 # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" } +64 #else +65 # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" +66 #endif +</pre> + +<p>The maximum number of levels in the <tt>rcu_node</tt> structure +is currently limited to four, as specified by lines 21-24 +and the structure of the subsequent “if” statement. +For 32-bit systems, this allows 16*32*32*32=524,288 CPUs, which +should be sufficient for the next few years at least. +For 64-bit systems, 16*64*64*64=4,194,304 CPUs is allowed, which +should see us through the next decade or so. 
+This four-level tree also allows kernels built with +<tt>CONFIG_RCU_FANOUT=8</tt> to support up to 4096 CPUs, +which might be useful in very large systems having eight CPUs per +socket (but please note that no one has yet shown any measurable +performance degradation due to misaligned socket and <tt>rcu_node</tt> +boundaries). +In addition, building kernels with a full four levels of <tt>rcu_node</tt> +tree permits better testing of RCU's combining-tree code. + +</p><p>The <tt>RCU_FANOUT</tt> symbol controls how many children +are permitted at each non-leaf level of the <tt>rcu_node</tt> tree. +If the <tt>CONFIG_RCU_FANOUT</tt> Kconfig option is not specified, +it is set based on the word size of the system, which is also +the Kconfig default. + +</p><p>The <tt>RCU_FANOUT_LEAF</tt> symbol controls how many CPUs are +handled by each leaf <tt>rcu_node</tt> structure. +Experience has shown that allowing a given leaf <tt>rcu_node</tt> +structure to handle 64 CPUs, as permitted by the number of bits in +the <tt>->qsmask</tt> field on a 64-bit system, results in +excessive contention for the leaf <tt>rcu_node</tt> structures' +<tt>->lock</tt> fields. +The number of CPUs per leaf <tt>rcu_node</tt> structure is therefore +limited to 16 given the default value of <tt>CONFIG_RCU_FANOUT_LEAF</tt>. +If <tt>CONFIG_RCU_FANOUT_LEAF</tt> is unspecified, the value +selected is based on the word size of the system, just as for +<tt>CONFIG_RCU_FANOUT</tt>. +Lines 11-19 perform this computation. + +</p><p>Lines 21-24 compute the maximum number of CPUs supported by +a single-level (which contains a single <tt>rcu_node</tt> structure), +two-level, three-level, and four-level <tt>rcu_node</tt> tree, +respectively, given the fanout specified by <tt>RCU_FANOUT</tt> +and <tt>RCU_FANOUT_LEAF</tt>. +These numbers of CPUs are retained in the +<tt>RCU_FANOUT_1</tt>, +<tt>RCU_FANOUT_2</tt>, +<tt>RCU_FANOUT_3</tt>, and +<tt>RCU_FANOUT_4</tt> +C-preprocessor variables, respectively. + +</p><p>These variables are used to control the C-preprocessor <tt>#if</tt> +statement spanning lines 26-66 that computes the number of +<tt>rcu_node</tt> structures required for each level of the tree, +as well as the number of levels required. +The number of levels is placed in the <tt>NUM_RCU_LVLS</tt> +C-preprocessor variable by lines 27, 35, 44, and 54. +The number of <tt>rcu_node</tt> structures for the topmost level +of the tree is always exactly one, and this value is unconditionally +placed into <tt>NUM_RCU_LVL_0</tt> by lines 28, 36, 45, and 55. +The rest of the levels (if any) of the <tt>rcu_node</tt> tree +are computed by dividing the maximum number of CPUs by the +fanout supported by the number of levels from the current level down, +rounding up. This computation is performed by lines 37, +46-47, and 56-58. +Lines 31-33, 40-42, 50-52, and 62-63 create initializers +for lockdep lock-class names. +Finally, lines 64-66 produce an error if the maximum number of +CPUs is too large for the specified fanout. + +<h3><a name="The rcu_data Structure"> +The <tt>rcu_data</tt> Structure</a></h3> + +<p>The <tt>rcu_data</tt> maintains the per-CPU state for the +corresponding flavor of RCU. +The fields in this structure may be accessed only from the corresponding +CPU (and from tracing) unless otherwise stated. +This structure is the +focus of quiescent-state detection and RCU callback queuing. 
+It also tracks its relationship to the corresponding leaf
+<tt>rcu_node</tt> structure to allow more-efficient
+propagation of quiescent states up the <tt>rcu_node</tt>
+combining tree.
+Like the <tt>rcu_node</tt> structure, it provides a local
+copy of the grace-period information to allow low-cost synchronized
+access to this information from the corresponding CPU.
+Finally, this structure records past dyntick-idle state
+for the corresponding CPU and also tracks statistics.
+
+</p><p>The <tt>rcu_data</tt> structure's fields are discussed,
+singly and in groups, in the following sections.
+
+<h5>Connection to Other Data Structures</h5>
+
+<p>This portion of the <tt>rcu_data</tt> structure is declared
+as follows:
+
+<pre>
+  1   int cpu;
+  2   struct rcu_state *rsp;
+  3   struct rcu_node *mynode;
+  4   struct rcu_dynticks *dynticks;
+  5   unsigned long grpmask;
+  6   bool beenonline;
+</pre>
+
+<p>The <tt>->cpu</tt> field contains the number of the
+corresponding CPU, the <tt>->rsp</tt> pointer references
+the corresponding <tt>rcu_state</tt> structure (and is most frequently
+used to locate the name of the corresponding flavor of RCU for tracing),
+and the <tt>->mynode</tt> field references the corresponding
+<tt>rcu_node</tt> structure.
+The <tt>->mynode</tt> field is used to propagate quiescent states
+up the combining tree.
+<p>The <tt>->dynticks</tt> pointer references the
+<tt>rcu_dynticks</tt> structure corresponding to this
+CPU.
+Recall that a single per-CPU instance of the <tt>rcu_dynticks</tt>
+structure is shared among all flavors of RCU.
+These first four fields are constant and therefore require no
+synchronization.
+
+</p><p>The <tt>->grpmask</tt> field indicates the bit in
+the <tt>->mynode->qsmask</tt> corresponding to this
+<tt>rcu_data</tt> structure, and is also used when propagating
+quiescent states.
+The <tt>->beenonline</tt> flag is set whenever the corresponding
+CPU comes online, which means that the debugfs tracing need not dump
+out any <tt>rcu_data</tt> structure for which this flag is not set.
+
+<h5>Quiescent-State and Grace-Period Tracking</h5>
+
+<p>This portion of the <tt>rcu_data</tt> structure is declared
+as follows:
+
+<pre>
+  1   unsigned long completed;
+  2   unsigned long gpnum;
+  3   bool cpu_no_qs;
+  4   bool core_needs_qs;
+  5   bool gpwrap;
+  6   unsigned long rcu_qs_ctr_snap;
+</pre>
+
+<p>The <tt>->completed</tt> and <tt>->gpnum</tt>
+fields are the counterparts of the fields of the same name
+in the <tt>rcu_state</tt> and <tt>rcu_node</tt> structures.
+They may each lag up to one behind their <tt>rcu_node</tt>
+counterparts, but in <tt>CONFIG_NO_HZ_IDLE</tt> and
+<tt>CONFIG_NO_HZ_FULL</tt> kernels can lag
+arbitrarily far behind for CPUs in dyntick-idle mode (but these counters
+will catch up upon exit from dyntick-idle mode).
+If a given <tt>rcu_data</tt> structure's <tt>->gpnum</tt> and
+<tt>->completed</tt> fields are equal, then this <tt>rcu_data</tt>
+structure believes that RCU is idle.
+Otherwise, as with the <tt>rcu_state</tt> and <tt>rcu_node</tt>
+structures,
+the <tt>->gpnum</tt> field will be one greater than the
+<tt>->completed</tt> field, with <tt>->gpnum</tt>
+indicating which grace period this <tt>rcu_data</tt> believes
+is still being waited for.
+
+<table>
+<tr><th> </th></tr>
+<tr><th align="left">Quick Quiz:</th></tr>
+<tr><td>
+	All this replication of the grace period numbers can only cause
+	massive confusion.
+	Why not just keep a global pair of counters and be done with it???
+</td></tr>
<tr><th align="left">Answer:</th></tr>
<tr><td bgcolor="#ffffff"><font color="ffffff">
	Because if there were only a single global pair of grace-period
	numbers, there would need to be a single global lock to allow
	safely accessing and updating them.
	And if we are not going to have a single global lock, we need
	to carefully manage the numbers on a per-node basis.
	Recall from the answer to a previous Quick Quiz that the consequences
	of applying a previously sampled quiescent state to the wrong
	grace period are quite severe.
</font></td></tr>
<tr><td> </td></tr>
</table>

<p>The <tt>->cpu_no_qs</tt> flag indicates that the
CPU has not yet passed through a quiescent state,
while the <tt>->core_needs_qs</tt> flag indicates that the
RCU core needs a quiescent state from the corresponding CPU.
The <tt>->gpwrap</tt> field indicates that the corresponding
CPU has remained idle for so long that the <tt>->completed</tt>
and <tt>->gpnum</tt> counters are in danger of overflow, which
will cause the CPU to disregard the values of its counters on
its next exit from idle.
Finally, the <tt>->rcu_qs_ctr_snap</tt> field is used to detect
cases where a given operation has resulted in a quiescent state
for all flavors of RCU, for example, <tt>cond_resched_rcu_qs()</tt>.

<h5>RCU Callback Handling</h5>

<p>In the absence of CPU-hotplug events, RCU callbacks are invoked by
the same CPU that registered them.
This is strictly a cache-locality optimization: callbacks can and
do get invoked on CPUs other than the one that registered them.
After all, if the CPU that registered a given callback has gone
offline before the callback can be invoked, there really is no other
choice.

</p><p>This portion of the <tt>rcu_data</tt> structure is declared
as follows:

<pre>
  1   struct rcu_head *nxtlist;
  2   struct rcu_head **nxttail[RCU_NEXT_SIZE];
  3   unsigned long nxtcompleted[RCU_NEXT_SIZE];
  4   long qlen_lazy;
  5   long qlen;
  6   long qlen_last_fqs_check;
  7   unsigned long n_force_qs_snap;
  8   unsigned long n_cbs_invoked;
  9   unsigned long n_cbs_orphaned;
 10   unsigned long n_cbs_adopted;
 11   long blimit;
</pre>

<p>The <tt>->nxtlist</tt> pointer and the
<tt>->nxttail[]</tt> array form a four-segment list with
older callbacks near the head and newer ones near the tail.
Each segment contains callbacks with the corresponding relationship
to the current grace period.
The pointer out of the end of each of the four segments is referenced
by the element of the <tt>->nxttail[]</tt> array indexed by
<tt>RCU_DONE_TAIL</tt> (for callbacks handled by a prior grace period),
<tt>RCU_WAIT_TAIL</tt> (for callbacks waiting on the current grace period),
<tt>RCU_NEXT_READY_TAIL</tt> (for callbacks that will wait on the next
grace period), and
<tt>RCU_NEXT_TAIL</tt> (for callbacks that are not yet associated
with a specific grace period),
respectively, as shown in the following figure.

</p><p><img src="nxtlist.svg" alt="nxtlist.svg" width="40%">

</p><p>In this figure, the <tt>->nxtlist</tt> pointer references the
first RCU callback in the list.
The <tt>->nxttail[RCU_DONE_TAIL]</tt> array element references
the <tt>->nxtlist</tt> pointer itself, indicating that none
of the callbacks is ready to invoke.
The <tt>->nxttail[RCU_WAIT_TAIL]</tt> array element references callback
CB 2's <tt>->next</tt> pointer, which indicates that
CB 1 and CB 2 are both waiting on the current grace period.
The <tt>->nxttail[RCU_NEXT_READY_TAIL]</tt> array element
references the same RCU callback that <tt>->nxttail[RCU_WAIT_TAIL]</tt>
does, which indicates that there are no callbacks waiting on the next
RCU grace period.
The <tt>->nxttail[RCU_NEXT_TAIL]</tt> array element references
CB 4's <tt>->next</tt> pointer, indicating that the
remaining RCU callbacks have not yet been assigned to an RCU grace
period.
Note that the <tt>->nxttail[RCU_NEXT_TAIL]</tt> array element
always references the last RCU callback's <tt>->next</tt> pointer
unless the callback list is empty, in which case it references
the <tt>->nxtlist</tt> pointer.
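</p><p>To make the role of the <tt>->nxttail[]</tt> array concrete,
the following sketch shows how a new callback might be enqueued.
It illustrates only the list manipulation, and is not the kernel's
actual <tt>call_rcu()</tt> code, which must in addition disable
interrupts, update the length counters described below, and handle
a number of special cases; the <tt>enqueue_callback()</tt> name is
hypothetical.

<pre>
static void enqueue_callback(struct rcu_data *rdp, struct rcu_head *head,
                             void (*func)(struct rcu_head *head))
{
  head->func = func;
  head->next = NULL;
  *rdp->nxttail[RCU_NEXT_TAIL] = head;       /* Append after the last callback. */
  rdp->nxttail[RCU_NEXT_TAIL] = &head->next; /* Tail now ends at the new callback. */
}
</pre>

<p>Because <tt>->nxttail[RCU_NEXT_TAIL]</tt> references either the
last callback's <tt>->next</tt> pointer or, for an empty list, the
<tt>->nxtlist</tt> pointer itself, the same two assignments handle
both the empty and the non-empty case.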
+</p><p>CPUs advance their callbacks from the
<tt>RCU_NEXT_TAIL</tt> to the <tt>RCU_NEXT_READY_TAIL</tt> to the
<tt>RCU_WAIT_TAIL</tt> to the <tt>RCU_DONE_TAIL</tt> list segments
as grace periods advance.
The CPU advances the callbacks in its <tt>rcu_data</tt> structure
whenever it notices that another RCU grace period has completed.
The CPU detects the completion of an RCU grace period by noticing
that the value of its <tt>rcu_data</tt> structure's
<tt>->completed</tt> field differs from that of its leaf
<tt>rcu_node</tt> structure.
Recall that each <tt>rcu_node</tt> structure's
<tt>->completed</tt> field is updated at the end of each
grace period.

</p><p>The <tt>->nxtcompleted[]</tt> array records grace-period
numbers corresponding to the list segments.
This allows CPUs that go idle for extended periods to determine
which of their callbacks are ready to be invoked after reawakening.

</p><p>The <tt>->qlen</tt> counter contains the number of
callbacks in <tt>->nxtlist</tt>, and the
<tt>->qlen_lazy</tt> counter contains the number of those callbacks
that are known to only free memory, and whose invocation can therefore
be safely deferred.
The <tt>->qlen_last_fqs_check</tt> and
<tt>->n_force_qs_snap</tt> fields coordinate the forcing of quiescent
states from <tt>call_rcu()</tt> and friends when callback
lists grow excessively long.

</p><p>The <tt>->n_cbs_invoked</tt>,
<tt>->n_cbs_orphaned</tt>, and <tt>->n_cbs_adopted</tt>
fields count the number of callbacks invoked,
sent to other CPUs when this CPU goes offline,
and received from other CPUs when those other CPUs go offline,
respectively.
Finally, the <tt>->blimit</tt> counter is the maximum number of
RCU callbacks that may be invoked at a given time.

<h5>Dyntick-Idle Handling</h5>

<p>This portion of the <tt>rcu_data</tt> structure is declared
as follows:

<pre>
  1   int dynticks_snap;
  2   unsigned long dynticks_fqs;
</pre>

The <tt>->dynticks_snap</tt> field is used to take a snapshot
of the corresponding CPU's dyntick-idle state when forcing
quiescent states, and is therefore accessed from other CPUs.
The <tt>->dynticks_fqs</tt> field counts the number of times
this CPU is determined to be in dyntick-idle state, and is used
for tracing and debugging purposes.

<h3><a name="The rcu_dynticks Structure">
The <tt>rcu_dynticks</tt> Structure</a></h3>

<p>The <tt>rcu_dynticks</tt> structure maintains the per-CPU
dyntick-idle state for the corresponding CPU.
Unlike the other structures, <tt>rcu_dynticks</tt> is not
replicated over the different flavors of RCU.
The fields in this structure may be accessed only from the corresponding
CPU (and from tracing) unless otherwise stated.
+Its fields are as follows: + +<pre> + 1 int dynticks_nesting; + 2 int dynticks_nmi_nesting; + 3 atomic_t dynticks; +</pre> + +<p>The <tt>->dynticks_nesting</tt> field counts the +nesting depth of normal interrupts. +In addition, this counter is incremented when exiting dyntick-idle +mode and decremented when entering it. +This counter can therefore be thought of as counting the number +of reasons why this CPU cannot be permitted to enter dyntick-idle +mode, aside from non-maskable interrupts (NMIs). +NMIs are counted by the <tt>->dynticks_nmi_nesting</tt> +field, except that NMIs that interrupt non-dyntick-idle execution +are not counted. + +</p><p>Finally, the <tt>->dynticks</tt> field counts the corresponding +CPU's transitions to and from dyntick-idle mode, so that this counter +has an even value when the CPU is in dyntick-idle mode and an odd +value otherwise. + +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + Why not just count all NMIs? + Wouldn't that be simpler and less error prone? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + It seems simpler only until you think hard about how to go about + updating the <tt>rcu_dynticks</tt> structure's + <tt>->dynticks</tt> field. +</font></td></tr> +<tr><td> </td></tr> +</table> + +<p>Additional fields are present for some special-purpose +builds, and are discussed separately. + +<h3><a name="The rcu_head Structure"> +The <tt>rcu_head</tt> Structure</a></h3> + +<p>Each <tt>rcu_head</tt> structure represents an RCU callback. +These structures are normally embedded within RCU-protected data +structures whose algorithms use asynchronous grace periods. +In contrast, when using algorithms that block waiting for RCU grace periods, +RCU users need not provide <tt>rcu_head</tt> structures. + +</p><p>The <tt>rcu_head</tt> structure has fields as follows: + +<pre> + 1 struct rcu_head *next; + 2 void (*func)(struct rcu_head *head); +</pre> + +<p>The <tt>->next</tt> field is used +to link the <tt>rcu_head</tt> structures together in the +lists within the <tt>rcu_data</tt> structures. +The <tt>->func</tt> field is a pointer to the function +to be called when the callback is ready to be invoked, and +this function is passed a pointer to the <tt>rcu_head</tt> +structure. +However, <tt>kfree_rcu()</tt> uses the <tt>->func</tt> +field to record the offset of the <tt>rcu_head</tt> +structure within the enclosing RCU-protected data structure. + +</p><p>Both of these fields are used internally by RCU. +From the viewpoint of RCU users, this structure is an +opaque “cookie”. + +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + Given that the callback function <tt>->func</tt> + is passed a pointer to the <tt>rcu_head</tt> structure, + how is that function supposed to find the beginning of the + enclosing RCU-protected data structure? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + In actual practice, there is a separate callback function per + type of RCU-protected data structure. + The callback function can therefore use the <tt>container_of()</tt> + macro in the Linux kernel (or other pointer-manipulation facilities + in other software environments) to find the beginning of the + enclosing structure. 
+</font></td></tr>
<tr><td> </td></tr>
</table>

<h3><a name="RCU-Specific Fields in the task_struct Structure">
RCU-Specific Fields in the <tt>task_struct</tt> Structure</a></h3>

<p>The <tt>CONFIG_PREEMPT_RCU</tt> implementation uses some
additional fields in the <tt>task_struct</tt> structure:

<pre>
  1 #ifdef CONFIG_PREEMPT_RCU
  2   int rcu_read_lock_nesting;
  3   union rcu_special rcu_read_unlock_special;
  4   struct list_head rcu_node_entry;
  5   struct rcu_node *rcu_blocked_node;
  6 #endif /* #ifdef CONFIG_PREEMPT_RCU */
  7 #ifdef CONFIG_TASKS_RCU
  8   unsigned long rcu_tasks_nvcsw;
  9   bool rcu_tasks_holdout;
 10   struct list_head rcu_tasks_holdout_list;
 11   int rcu_tasks_idle_cpu;
 12 #endif /* #ifdef CONFIG_TASKS_RCU */
</pre>

<p>The <tt>->rcu_read_lock_nesting</tt> field records the
nesting level for RCU read-side critical sections, and
the <tt>->rcu_read_unlock_special</tt> field is a bitmask
that records special conditions that require <tt>rcu_read_unlock()</tt>
to do additional work.
The <tt>->rcu_node_entry</tt> field is used to form lists of
tasks that have blocked within preemptible-RCU read-side critical
sections, and the <tt>->rcu_blocked_node</tt> field references
the <tt>rcu_node</tt> structure whose list this task is a member of,
or <tt>NULL</tt> if it is not blocked within a preemptible-RCU
read-side critical section.

<p>The <tt>->rcu_tasks_nvcsw</tt> field tracks the number of
voluntary context switches that this task had undergone at the
beginning of the current tasks-RCU grace period,
<tt>->rcu_tasks_holdout</tt> is set if the current tasks-RCU
grace period is waiting on this task, <tt>->rcu_tasks_holdout_list</tt>
is a list element enqueuing this task on the holdout list,
and <tt>->rcu_tasks_idle_cpu</tt> tracks which CPU this
idle task is running on, but only if the task is currently running,
that is, if the CPU is currently idle.

<h3><a name="Accessor Functions">
Accessor Functions</a></h3>

<p>The following listing shows the
<tt>rcu_get_root()</tt>, <tt>rcu_for_each_node_breadth_first()</tt>,
<tt>rcu_for_each_nonleaf_node_breadth_first()</tt>, and
<tt>rcu_for_each_leaf_node()</tt> function and macros:

<pre>
  1 static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
  2 {
  3   return &rsp->node[0];
  4 }
  5
  6 #define rcu_for_each_node_breadth_first(rsp, rnp) \
  7   for ((rnp) = &(rsp)->node[0]; \
  8        (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
  9
 10 #define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
 11   for ((rnp) = &(rsp)->node[0]; \
 12        (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++)
 13
 14 #define rcu_for_each_leaf_node(rsp, rnp) \
 15   for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \
 16        (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
</pre>

<p>The <tt>rcu_get_root()</tt> function simply returns a pointer to the
first element of the specified <tt>rcu_state</tt> structure's
<tt>->node[]</tt> array, which is the root <tt>rcu_node</tt>
structure.

</p><p>As noted earlier, the <tt>rcu_for_each_node_breadth_first()</tt>
macro takes advantage of the layout of the <tt>rcu_node</tt>
structures in the <tt>rcu_state</tt> structure's
<tt>->node[]</tt> array, performing a breadth-first traversal by
simply traversing the array in order.
The <tt>rcu_for_each_nonleaf_node_breadth_first()</tt> macro operates
similarly, but traverses only the first part of the array, thus excluding
the leaf <tt>rcu_node</tt> structures.
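</p><p>For example, a debugging helper could use
<tt>rcu_for_each_node_breadth_first()</tt> to print the span of CPUs
covered by each <tt>rcu_node</tt> structure, using that structure's
<tt>->grplo</tt> and <tt>->grphi</tt> fields.
This helper is purely illustrative and is not part of the kernel:

<pre>
static void rcu_dump_node_spans(struct rcu_state *rsp)
{
  struct rcu_node *rnp;

  /* In-order walk of ->node[]: the root first, then each lower level. */
  rcu_for_each_node_breadth_first(rsp, rnp)
    pr_info("rcu_node %p: CPUs %d-%d\n", rnp, rnp->grplo, rnp->grphi);
}
</pre>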
+Finally, the <tt>rcu_for_each_leaf_node()</tt> macro traverses only +the last part of the array, thus traversing only the leaf +<tt>rcu_node</tt> structures. + +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + What do <tt>rcu_for_each_nonleaf_node_breadth_first()</tt> and + <tt>rcu_for_each_leaf_node()</tt> do if the <tt>rcu_node</tt> tree + contains only a single node? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + In the single-node case, + <tt>rcu_for_each_nonleaf_node_breadth_first()</tt> is a no-op + and <tt>rcu_for_each_leaf_node()</tt> traverses the single node. +</font></td></tr> +<tr><td> </td></tr> +</table> + +<h3><a name="Summary"> +Summary</a></h3> + +So each flavor of RCU is represented by an <tt>rcu_state</tt> structure, +which contains a combining tree of <tt>rcu_node</tt> and +<tt>rcu_data</tt> structures. +Finally, in <tt>CONFIG_NO_HZ_IDLE</tt> kernels, each CPU's dyntick-idle +state is tracked by an <tt>rcu_dynticks</tt> structure. + +If you made it this far, you are well prepared to read the code +walkthroughs in the other articles in this series. + +<h3><a name="Acknowledgments"> +Acknowledgments</a></h3> + +I owe thanks to Cyrill Gorcunov, Mathieu Desnoyers, Dhaval Giani, Paul +Turner, Abhishek Srivastava, Matt Kowalczyk, and Serge Hallyn +for helping me get this document into a more human-readable state. + +<h3><a name="Legal Statement"> +Legal Statement</a></h3> + +<p>This work represents the view of the author and does not necessarily +represent the view of IBM. + +</p><p>Linux is a registered trademark of Linus Torvalds. + +</p><p>Other company, product, and service names may be trademarks or +service marks of others. + +</body></html> diff --git a/Documentation/RCU/Design/Data-Structures/HugeTreeClassicRCU.svg b/Documentation/RCU/Design/Data-Structures/HugeTreeClassicRCU.svg new file mode 100644 index 000000000000..2bf12b468206 --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/HugeTreeClassicRCU.svg @@ -0,0 +1,939 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- Creator: fig2dev Version 3.2 Patchlevel 5e --> + +<!-- CreationDate: Wed Dec 9 17:37:22 2015 --> + +<!-- Magnification: 3.000 --> + +<svg + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns="http://www.w3.org/2000/svg" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + width="15.1in" + height="11.2in" + viewBox="-66 -66 18087 13407" + id="svg2" + version="1.1" + inkscape:version="0.48.4 r9939" + sodipodi:docname="HugeTreeClassicRCU.fig"> + <metadata + id="metadata224"> + <rdf:RDF> + <cc:Work + rdf:about=""> + <dc:format>image/svg+xml</dc:format> + <dc:type + rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> + <dc:title></dc:title> + </cc:Work> + </rdf:RDF> + </metadata> + <defs + id="defs222"> + <marker + inkscape:stockid="Arrow1Mend" + orient="auto" + refY="0.0" + refX="0.0" + id="Arrow1Mend" + style="overflow:visible;"> + <path + id="path3982" + d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z " + style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;" + transform="scale(0.4) rotate(180) translate(10,0)" /> + </marker> + </defs> + <sodipodi:namedview + pagecolor="#ffffff" + bordercolor="#666666" + borderopacity="1" + objecttolerance="10" + 
gridtolerance="10" + guidetolerance="10" + inkscape:pageopacity="0" + inkscape:pageshadow="2" + inkscape:window-width="1134" + inkscape:window-height="789" + id="namedview220" + showgrid="false" + inkscape:zoom="0.60515873" + inkscape:cx="679.5" + inkscape:cy="504" + inkscape:window-x="786" + inkscape:window-y="24" + inkscape:window-maximized="0" + inkscape:current-layer="g4" /> + <g + style="stroke-width:.025in; fill:none" + id="g4"> + <!-- Line: box --> + <rect + x="450" + y="0" + width="17100" + height="8325" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; " + id="rect6" /> + <!-- Line: box --> + <rect + x="11025" + y="3600" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect8" /> + <!-- Line: box --> + <rect + x="4275" + y="3600" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect10" /> + <!-- Line: box --> + <rect + x="5400" + y="6300" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect12" /> + <!-- Line: box --> + <rect + x="9900" + y="6300" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect14" /> + <!-- Line: box --> + <rect + x="14400" + y="6300" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect16" /> + <!-- Line: box --> + <rect + x="900" + y="6300" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect18" /> + <!-- Line: box --> + <rect + x="7650" + y="900" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect20" /> + <!-- Line --> + <polyline + points="3150,9225 3150,7746 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline22" /> + <!-- Arrowhead on XXXpoint 3150 9225 - 3150 7560--> + <!-- Circle --> + <circle + cx="8550" + cy="4275" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle26" /> + <!-- Circle --> + <circle + cx="9000" + cy="4275" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle28" /> + <!-- Circle --> + <circle + cx="9450" + cy="4275" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle30" /> + <!-- Line --> + <polyline + points="6750,6300 8250,5010 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline32" /> + <!-- Arrowhead on XXXpoint 6750 6300 - 8391 4890--> + <!-- Line --> + <polyline + points="11250,6300 9747,5010 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline36" /> + <!-- Arrowhead on XXXpoint 11250 6300 - 9606 4890--> + <!-- Circle --> + <circle + cx="13950" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle40" /> + <!-- 
Circle --> + <circle + cx="13500" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle42" /> + <!-- Circle --> + <circle + cx="13050" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle44" /> + <!-- Circle --> + <circle + cx="9450" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle46" /> + <!-- Circle --> + <circle + cx="9000" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle48" /> + <!-- Circle --> + <circle + cx="8550" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle50" /> + <!-- Circle --> + <circle + cx="4950" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle52" /> + <!-- Circle --> + <circle + cx="4500" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle54" /> + <!-- Circle --> + <circle + cx="4050" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle56" /> + <!-- Circle --> + <circle + cx="1800" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle58" /> + <!-- Circle --> + <circle + cx="2250" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle60" /> + <!-- Circle --> + <circle + cx="2700" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle62" /> + <!-- Circle --> + <circle + cx="15300" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle64" /> + <!-- Circle --> + <circle + cx="15750" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle66" /> + <!-- Circle --> + <circle + cx="16200" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle68" /> + <!-- Circle --> + <circle + cx="10800" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle70" /> + <!-- Circle --> + <circle + cx="11250" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle72" /> + <!-- Circle --> + <circle + cx="11700" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle74" /> + <!-- Circle --> + <circle + cx="6300" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle76" /> + <!-- Circle --> + <circle + cx="6750" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle78" /> + <!-- Circle --> + <circle + cx="7200" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle80" /> + <!-- Line: box --> + <rect + x="0" + y="11475" + width="2700" + height="1800" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect82" /> + <!-- Line: box --> + <rect + x="1800" + y="9225" + width="2700" + height="1800" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect84" /> + <!-- Line: box --> + <rect + x="4500" + y="11475" + width="2700" + height="1800" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect86" /> + <!-- Line: box --> + <rect + x="6300" + y="9270" + width="2700" + height="1800" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect88" /> + <!-- Line: box --> + <rect + x="8955" + y="11475" + 
width="2700" + height="1800" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect90" /> + <!-- Line: box --> + <rect + x="10755" + y="9270" + width="2700" + height="1800" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect92" /> + <!-- Line: box --> + <rect + x="13455" + y="11475" + width="2700" + height="1800" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect94" /> + <!-- Line: box --> + <rect + x="15255" + y="9270" + width="2700" + height="1800" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect96" /> + <!-- Line --> + <polyline + points="11700,3600 10197,2310 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline98" /> + <!-- Arrowhead on XXXpoint 11700 3600 - 10056 2190--> + <!-- Line --> + <polyline + points="6300,3600 7800,2310 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline102" /> + <!-- Arrowhead on XXXpoint 6300 3600 - 7941 2190--> + <!-- Line --> + <polyline + points="3150,6300 4650,5010 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline106" /> + <!-- Arrowhead on XXXpoint 3150 6300 - 4791 4890--> + <!-- Line --> + <polyline + points="14850,6300 13347,5010 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline110" /> + <!-- Arrowhead on XXXpoint 14850 6300 - 13206 4890--> + <!-- Line --> + <polyline + points="1350,11475 1350,7746 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline114" /> + <!-- Arrowhead on XXXpoint 1350 11475 - 1350 7560--> + <!-- Line --> + <polyline + points="16650,9225 16650,7746 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline118" /> + <!-- Arrowhead on XXXpoint 16650 9225 - 16650 7560--> + <!-- Line --> + <polyline + points="14850,11475 14850,7746 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline122" /> + <!-- Arrowhead on XXXpoint 14850 11475 - 14850 7560--> + <!-- Line --> + <polyline + points="12150,9225 12150,7746 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline126" /> + <!-- Arrowhead on XXXpoint 12150 9225 - 12150 7560--> + <!-- Line --> + <polyline + points="10350,11475 10350,7746 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline130" /> + <!-- Arrowhead on XXXpoint 10350 11475 - 10350 7560--> + <!-- Line --> + <polyline + points="7650,9225 7650,7746 " + 
style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline134" /> + <!-- Arrowhead on XXXpoint 7650 9225 - 7650 7560--> + <!-- Line --> + <polyline + points="5850,11475 5850,7746 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline138" /> + <!-- Arrowhead on XXXpoint 5850 11475 - 5850 7560--> + <!-- Text --> + <text + xml:space="preserve" + x="12375" + y="4500" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text142">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="12375" + y="4050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text144">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="5625" + y="4050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text146">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="5625" + y="4500" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text148">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="6750" + y="6750" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text150">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="6750" + y="7200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text152">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="11250" + y="7200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text154">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="11250" + y="6750" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text156">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="15750" + y="7200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text158">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="15750" + y="6750" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text160">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="2250" + y="6750" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text162">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="2250" + y="7200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text164">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="1350" + y="13050" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text166">CPU 0</text> + <!-- Text --> + <text + xml:space="preserve" + x="1350" + y="11925" + fill="#000000" + 
font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text168">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="1350" + y="12375" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text170">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="3150" + y="10800" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text172">CPU 15</text> + <!-- Text --> + <text + xml:space="preserve" + x="3150" + y="9675" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text174">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="3150" + y="10125" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text176">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="5850" + y="11925" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text178">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="5850" + y="12375" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text180">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="5850" + y="13050" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text182">CPU 21823</text> + <!-- Text --> + <text + xml:space="preserve" + x="7650" + y="10845" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text184">CPU 21839</text> + <!-- Text --> + <text + xml:space="preserve" + x="7650" + y="10170" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text186">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="7650" + y="9720" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text188">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="10305" + y="11925" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text190">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="10305" + y="12375" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text192">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="10305" + y="13050" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text194">CPU 43679</text> + <!-- Text --> + <text + xml:space="preserve" + x="12105" + y="10845" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text196">CPU 43695</text> + <!-- Text --> + <text + xml:space="preserve" + x="12105" + y="10170" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text198">rcu_data</text> + <!-- Text 
--> + <text + xml:space="preserve" + x="12105" + y="9720" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text200">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="14805" + y="11925" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text202">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="14805" + y="12375" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text204">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="14805" + y="13050" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text206">CPU 65519</text> + <!-- Text --> + <text + xml:space="preserve" + x="16605" + y="10845" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text208">CPU 65535</text> + <!-- Text --> + <text + xml:space="preserve" + x="16605" + y="10170" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text210">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="16605" + y="9720" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text212">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="675" + y="450" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="start" + id="text214">struct rcu_state</text> + <!-- Text --> + <text + xml:space="preserve" + x="9000" + y="1350" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text216">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="9000" + y="1800" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text218">rcu_node</text> + </g> +</svg> diff --git a/Documentation/RCU/Design/Data-Structures/TreeLevel.svg b/Documentation/RCU/Design/Data-Structures/TreeLevel.svg new file mode 100644 index 000000000000..7a7eb3bac95c --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/TreeLevel.svg @@ -0,0 +1,828 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- Creator: fig2dev Version 3.2 Patchlevel 5e --> + +<!-- CreationDate: Wed Dec 9 17:41:29 2015 --> + +<!-- Magnification: 3.000 --> + +<svg + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns="http://www.w3.org/2000/svg" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + width="17.7in" + height="10.4in" + viewBox="-66 -66 21237 12507" + id="svg2" + version="1.1" + inkscape:version="0.48.4 r9939" + sodipodi:docname="TreeLevel.fig"> + <metadata + id="metadata216"> + <rdf:RDF> + <cc:Work + rdf:about=""> + <dc:format>image/svg+xml</dc:format> + <dc:type + rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> + <dc:title></dc:title> + </cc:Work> + </rdf:RDF> + </metadata> + <defs + id="defs214"> + <marker + 
inkscape:stockid="Arrow1Mend" + orient="auto" + refY="0.0" + refX="0.0" + id="Arrow1Mend" + style="overflow:visible;"> + <path + id="path3974" + d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z " + style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;" + transform="scale(0.4) rotate(180) translate(10,0)" /> + </marker> + </defs> + <sodipodi:namedview + pagecolor="#ffffff" + bordercolor="#666666" + borderopacity="1" + objecttolerance="10" + gridtolerance="10" + guidetolerance="10" + inkscape:pageopacity="0" + inkscape:pageshadow="2" + inkscape:window-width="1023" + inkscape:window-height="1148" + id="namedview212" + showgrid="false" + inkscape:zoom="0.55869424" + inkscape:cx="796.50006" + inkscape:cy="467.99997" + inkscape:window-x="897" + inkscape:window-y="24" + inkscape:window-maximized="0" + inkscape:current-layer="g4" /> + <g + style="stroke-width:.025in; fill:none" + id="g4"> + <!-- Line: box --> + <rect + x="0" + y="0" + width="20655" + height="8325" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; " + id="rect6" /> + <!-- Line: box --> + <rect + x="14130" + y="3600" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect8" /> + <!-- Line: box --> + <rect + x="7380" + y="3600" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect10" /> + <!-- Line: box --> + <rect + x="8505" + y="6300" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect12" /> + <!-- Line: box --> + <rect + x="13005" + y="6300" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect14" /> + <!-- Line: box --> + <rect + x="17505" + y="6300" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect16" /> + <!-- Line: box --> + <rect + x="4005" + y="6300" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect18" /> + <!-- Line: box --> + <rect + x="10755" + y="900" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect20" /> + <!-- Line --> + <polyline + points="6255,9225 6255,7746 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline22" /> + <!-- Arrowhead on XXXpoint 6255 9225 - 6255 7560--> + <!-- Circle --> + <circle + cx="11655" + cy="4275" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle26" /> + <!-- Circle --> + <circle + cx="12105" + cy="4275" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle28" /> + <!-- Circle --> + <circle + cx="12555" + cy="4275" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle30" /> + <!-- Line --> + <polyline + points="9855,6300 11355,5010 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline32" /> + <!-- Arrowhead on 
XXXpoint 9855 6300 - 11496 4890--> + <!-- Line --> + <polyline + points="14355,6300 12852,5010 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline36" /> + <!-- Arrowhead on XXXpoint 14355 6300 - 12711 4890--> + <!-- Circle --> + <circle + cx="17055" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle40" /> + <!-- Circle --> + <circle + cx="16605" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle42" /> + <!-- Circle --> + <circle + cx="16155" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle44" /> + <!-- Circle --> + <circle + cx="12555" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle46" /> + <!-- Circle --> + <circle + cx="12105" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle48" /> + <!-- Circle --> + <circle + cx="11655" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle50" /> + <!-- Circle --> + <circle + cx="8055" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle52" /> + <!-- Circle --> + <circle + cx="7605" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle54" /> + <!-- Circle --> + <circle + cx="7155" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle56" /> + <!-- Circle --> + <circle + cx="4905" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle58" /> + <!-- Circle --> + <circle + cx="5355" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle60" /> + <!-- Circle --> + <circle + cx="5805" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle62" /> + <!-- Circle --> + <circle + cx="18405" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle64" /> + <!-- Circle --> + <circle + cx="18855" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle66" /> + <!-- Circle --> + <circle + cx="19305" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle68" /> + <!-- Circle --> + <circle + cx="13905" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle70" /> + <!-- Circle --> + <circle + cx="14355" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle72" /> + <!-- Circle --> + <circle + cx="14805" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle74" /> + <!-- Circle --> + <circle + cx="9405" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle76" /> + <!-- Circle --> + <circle + cx="9855" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle78" /> + <!-- Circle --> + <circle + cx="10305" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle80" /> + <!-- Line: box --> + <rect + x="225" + y="1125" + width="3150" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:21; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; " + id="rect82" /> + <!-- Line: box --> + <rect + x="225" + y="2250" + width="3150" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:21; 
stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; " + id="rect84" /> + <!-- Line: box --> + <rect + x="225" + y="3375" + width="3150" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:21; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; " + id="rect86" /> + <!-- Line --> + <polyline + points="14805,3600 13302,2310 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline88" /> + <!-- Arrowhead on XXXpoint 14805 3600 - 13161 2190--> + <!-- Line --> + <polyline + points="9405,3600 10905,2310 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline92" /> + <!-- Arrowhead on XXXpoint 9405 3600 - 11046 2190--> + <!-- Line --> + <polyline + points="6255,6300 7755,5010 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline96" /> + <!-- Arrowhead on XXXpoint 6255 6300 - 7896 4890--> + <!-- Line --> + <polyline + points="17955,6300 16452,5010 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline100" /> + <!-- Arrowhead on XXXpoint 17955 6300 - 16311 4890--> + <!-- Line --> + <polyline + points="4455,11025 4455,7746 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline104" /> + <!-- Arrowhead on XXXpoint 4455 11025 - 4455 7560--> + <!-- Line --> + <polyline + points="19755,9225 19755,7746 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline108" /> + <!-- Arrowhead on XXXpoint 19755 9225 - 19755 7560--> + <!-- Line --> + <polyline + points="17955,11025 17955,7746 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline112" /> + <!-- Arrowhead on XXXpoint 17955 11025 - 17955 7560--> + <!-- Line --> + <polyline + points="15255,9225 15255,7746 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline116" /> + <!-- Arrowhead on XXXpoint 15255 9225 - 15255 7560--> + <!-- Line --> + <polyline + points="13455,11025 13455,7746 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline120" /> + <!-- Arrowhead on XXXpoint 13455 11025 - 13455 7560--> + <!-- Line --> + <polyline + points="10755,9225 10755,7746 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline124" /> + <!-- Arrowhead on XXXpoint 10755 9225 - 10755 7560--> + <!-- Line --> + <polyline + points="8955,11025 8955,7746 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline128" /> + <!-- 
Arrowhead on XXXpoint 8955 11025 - 8955 7560--> + <!-- Line: box --> + <rect + x="12105" + y="11025" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect132" /> + <!-- Line: box --> + <rect + x="13905" + y="9225" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect134" /> + <!-- Line: box --> + <rect + x="16605" + y="11025" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect136" /> + <!-- Line: box --> + <rect + x="18405" + y="9225" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect138" /> + <!-- Line: box --> + <rect + x="9405" + y="9225" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect140" /> + <!-- Line: box --> + <rect + x="7605" + y="11025" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect142" /> + <!-- Line: box --> + <rect + x="4905" + y="9225" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect144" /> + <!-- Line: box --> + <rect + x="3105" + y="11025" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect146" /> + <!-- Line --> + <polyline + points="3375,1575 10701,1575 " + style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline148" /> + <!-- Arrowhead on XXXpoint 3375 1575 - 10890 1575--> + <!-- Line --> + <polyline + points="3375,3825 4050,3825 4050,5400 2700,5400 2700,6975 3951,6975 " + style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline152" /> + <!-- Arrowhead on XXXpoint 2700 6975 - 4140 6975--> + <!-- Line --> + <polyline + points="3375,2700 5175,2700 5175,4275 7326,4275 " + style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline156" /> + <!-- Arrowhead on XXXpoint 5175 4275 - 7515 4275--> + <!-- Text --> + <text + xml:space="preserve" + x="15480" + y="4500" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text160">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="15480" + y="4050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text162">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="8730" + y="4050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text164">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="8730" + y="4500" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text166">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="9855" + y="6750" + 
fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text168">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="9855" + y="7200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text170">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="14355" + y="7200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text172">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="14355" + y="6750" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text174">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="18855" + y="7200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text176">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="18855" + y="6750" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text178">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="5355" + y="6750" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text180">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="5355" + y="7200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text182">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="450" + y="1800" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="324" + text-anchor="start" + id="text184">->level[0]</text> + <!-- Text --> + <text + xml:space="preserve" + x="450" + y="2925" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="324" + text-anchor="start" + id="text186">->level[1]</text> + <!-- Text --> + <text + xml:space="preserve" + x="450" + y="4050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="324" + text-anchor="start" + id="text188">->level[2]</text> + <!-- Text --> + <text + xml:space="preserve" + x="12105" + y="1350" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text190">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="12105" + y="1800" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text192">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="6255" + y="10125" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text194">CPU 15</text> + <!-- Text --> + <text + xml:space="preserve" + x="4455" + y="11925" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text196">CPU 0</text> + <!-- Text --> + <text + xml:space="preserve" + x="19755" + y="10125" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text198">CPU 65535</text> + <!-- Text --> + 
<text + xml:space="preserve" + x="17955" + y="11925" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text200">CPU 65519</text> + <!-- Text --> + <text + xml:space="preserve" + x="15255" + y="10125" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text202">CPU 43695</text> + <!-- Text --> + <text + xml:space="preserve" + x="13455" + y="11925" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text204">CPU 43679</text> + <!-- Text --> + <text + xml:space="preserve" + x="10755" + y="10125" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text206">CPU 21839</text> + <!-- Text --> + <text + xml:space="preserve" + x="8955" + y="11925" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text208">CPU 21823</text> + <!-- Text --> + <text + xml:space="preserve" + x="225" + y="450" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="start" + id="text210">struct rcu_state</text> + </g> +</svg> diff --git a/Documentation/RCU/Design/Data-Structures/TreeMapping.svg b/Documentation/RCU/Design/Data-Structures/TreeMapping.svg new file mode 100644 index 000000000000..729cfa9e6cdb --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/TreeMapping.svg @@ -0,0 +1,305 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- Creator: fig2dev Version 3.2 Patchlevel 5e --> + +<!-- CreationDate: Wed Dec 9 17:43:22 2015 --> + +<!-- Magnification: 1.000 --> + +<svg + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns="http://www.w3.org/2000/svg" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + width="3.1in" + height="0.9in" + viewBox="-12 -12 3699 1074" + id="svg2" + version="1.1" + inkscape:version="0.48.4 r9939" + sodipodi:docname="TreeMapping.fig"> + <metadata + id="metadata66"> + <rdf:RDF> + <cc:Work + rdf:about=""> + <dc:format>image/svg+xml</dc:format> + <dc:type + rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> + <dc:title></dc:title> + </cc:Work> + </rdf:RDF> + </metadata> + <defs + id="defs64"> + <marker + inkscape:stockid="Arrow2Lend" + orient="auto" + refY="0.0" + refX="0.0" + id="Arrow2Lend" + style="overflow:visible;"> + <path + id="path3836" + style="fill-rule:evenodd;stroke-width:0.62500000;stroke-linejoin:round;" + d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z " + transform="scale(1.1) rotate(180) translate(1,0)" /> + </marker> + <marker + inkscape:stockid="Arrow2Mend" + orient="auto" + refY="0.0" + refX="0.0" + id="Arrow2Mend" + style="overflow:visible;"> + <path + id="path3842" + style="fill-rule:evenodd;stroke-width:0.62500000;stroke-linejoin:round;" + d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z " + transform="scale(0.6) rotate(180) translate(0,0)" /> + </marker> + <marker + 
inkscape:stockid="Arrow1Mend" + orient="auto" + refY="0.0" + refX="0.0" + id="Arrow1Mend" + style="overflow:visible;"> + <path + id="path3824" + d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z " + style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;" + transform="scale(0.4) rotate(180) translate(10,0)" /> + </marker> + </defs> + <sodipodi:namedview + pagecolor="#ffffff" + bordercolor="#666666" + borderopacity="1" + objecttolerance="10" + gridtolerance="10" + guidetolerance="10" + inkscape:pageopacity="0" + inkscape:pageshadow="2" + inkscape:window-width="991" + inkscape:window-height="606" + id="namedview62" + showgrid="false" + inkscape:zoom="3.0752688" + inkscape:cx="139.5" + inkscape:cy="40.5" + inkscape:window-x="891" + inkscape:window-y="177" + inkscape:window-maximized="0" + inkscape:current-layer="g4" /> + <g + style="stroke-width:.025in; fill:none" + id="g4"> + <!-- Line: box --> + <rect + x="0" + y="0" + width="3675" + height="1050" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; " + id="rect6" /> + <!-- Line: box --> + <rect + x="75" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect8" /> + <!-- Line: box --> + <rect + x="600" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect10" /> + <!-- Line: box --> + <rect + x="1125" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect12" /> + <!-- Line: box --> + <rect + x="1650" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect14" /> + <!-- Line: box --> + <rect + x="2175" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect16" /> + <!-- Line: box --> + <rect + x="3225" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect18" /> + <!-- Line --> + <polyline + points="675,375 675,150 300,150 300,358 " + style="stroke:#000000;stroke-width:7.00088889;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline20" /> + <!-- Arrowhead on XXXpoint 300 150 - 300 390--> + <!-- Line --> + <polyline + points="1200,675 1200,900 300,900 300,691 " + style="stroke:#000000;stroke-width:7.00088889;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline24" /> + <!-- Arrowhead on XXXpoint 300 900 - 300 660--> + <!-- Line --> + <polyline + points="1725,375 1725,150 900,150 900,358 " + style="stroke:#000000;stroke-width:7.00088889;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline28" /> + <!-- Arrowhead on XXXpoint 900 150 - 900 390--> + <!-- Line --> + <polyline + points="2250,375 2250,75 825,75 825,358 " + style="stroke:#000000;stroke-width:7.00088889;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline32" /> + <!-- Arrowhead on XXXpoint 825 75 - 825 390--> + <!-- Line 
--> + <polyline + points="2775,675 2775,900 1425,900 1425,691 " + style="stroke:#000000;stroke-width:7.00088889;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline36" /> + <!-- Arrowhead on XXXpoint 1425 900 - 1425 660--> + <!-- Line --> + <polyline + points="3300,675 3300,975 1350,975 1350,691 " + style="stroke:#000000;stroke-width:7.00088889;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline40" /> + <!-- Arrowhead on XXXpoint 1350 975 - 1350 660--> + <!-- Line: box --> + <rect + x="2700" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect44" /> + <!-- Text --> + <text + xml:space="preserve" + x="300" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text46">0:7 </text> + <!-- Text --> + <text + xml:space="preserve" + x="1350" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text48">4:7 </text> + <!-- Text --> + <text + xml:space="preserve" + x="1875" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text50">0:1 </text> + <!-- Text --> + <text + xml:space="preserve" + x="2400" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text52">2:3 </text> + <!-- Text --> + <text + xml:space="preserve" + x="2925" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text54">4:5 </text> + <!-- Text --> + <text + xml:space="preserve" + x="3450" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text56">6:7 </text> + <!-- Text --> + <text + xml:space="preserve" + x="825" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text58">0:3 </text> + <!-- Text --> + <text + xml:space="preserve" + x="3600" + y="150" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="end" + id="text60">struct rcu_state</text> + </g> +</svg> diff --git a/Documentation/RCU/Design/Data-Structures/TreeMappingLevel.svg b/Documentation/RCU/Design/Data-Structures/TreeMappingLevel.svg new file mode 100644 index 000000000000..5b416a4b8453 --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/TreeMappingLevel.svg @@ -0,0 +1,380 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- Creator: fig2dev Version 3.2 Patchlevel 5e --> + +<!-- CreationDate: Wed Dec 9 17:45:19 2015 --> + +<!-- Magnification: 1.000 --> + +<svg + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns="http://www.w3.org/2000/svg" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + width="3.1in" + height="1.8in" + viewBox="-12 -12 3699 2124" + id="svg2" + version="1.1" + inkscape:version="0.48.4 
r9939" + sodipodi:docname="TreeMappingLevel.svg"> + <metadata + id="metadata98"> + <rdf:RDF> + <cc:Work + rdf:about=""> + <dc:format>image/svg+xml</dc:format> + <dc:type + rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> + <dc:title /> + </cc:Work> + </rdf:RDF> + </metadata> + <defs + id="defs96"> + <marker + inkscape:stockid="Arrow2Lend" + orient="auto" + refY="0.0" + refX="0.0" + id="Arrow2Lend" + style="overflow:visible;"> + <path + id="path3868" + style="fill-rule:evenodd;stroke-width:0.62500000;stroke-linejoin:round;" + d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z " + transform="scale(1.1) rotate(180) translate(1,0)" /> + </marker> + </defs> + <sodipodi:namedview + pagecolor="#ffffff" + bordercolor="#666666" + borderopacity="1" + objecttolerance="10" + gridtolerance="10" + guidetolerance="10" + inkscape:pageopacity="0" + inkscape:pageshadow="2" + inkscape:window-width="1598" + inkscape:window-height="1211" + id="namedview94" + showgrid="false" + inkscape:zoom="5.2508961" + inkscape:cx="139.5" + inkscape:cy="81" + inkscape:window-x="840" + inkscape:window-y="122" + inkscape:window-maximized="0" + inkscape:current-layer="g4" /> + <g + style="stroke-width:.025in; fill:none" + id="g4"> + <!-- Line: box --> + <rect + x="0" + y="0" + width="3675" + height="2100" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; " + id="rect6" /> + <!-- Line: box --> + <rect + x="75" + y="1350" + width="750" + height="225" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect8" /> + <!-- Line: box --> + <rect + x="75" + y="1575" + width="750" + height="225" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect10" /> + <!-- Line: box --> + <rect + x="75" + y="1800" + width="750" + height="225" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect12" /> + <!-- Arc --> + <path + style="stroke:#000000;stroke-width:7;stroke-linecap:butt;" + d="M 1800,900 A 118 118 0 0 0 1800 1125 " + id="path14" /> + <!-- Arc --> + <path + style="stroke:#000000;stroke-width:7;stroke-linecap:butt;" + d="M 750,900 A 75 75 0 0 0 750 1050 " + id="path16" /> + <!-- Line --> + <polyline + points="750,900 750,691 " + style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline18" /> + <!-- Arrowhead on XXXpoint 750 900 - 750 660--> + <!-- Line: box --> + <rect + x="75" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect22" /> + <!-- Line: box --> + <rect + x="600" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect24" /> + <!-- Line: box --> + <rect + x="1650" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect26" /> + <!-- Line: box --> + <rect + x="2175" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect28" /> + <!-- Line: box --> + <rect + x="3225" + 
y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect30" /> + <!-- Line --> + <polyline + points="675,375 675,150 300,150 300,358 " + style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline32" /> + <!-- Arrowhead on XXXpoint 300 150 - 300 390--> + <!-- Line --> + <polyline + points="1725,375 1725,150 900,150 900,358 " + style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline36" /> + <!-- Arrowhead on XXXpoint 900 150 - 900 390--> + <!-- Line --> + <polyline + points="2250,375 2250,75 825,75 825,358 " + style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline40" /> + <!-- Arrowhead on XXXpoint 825 75 - 825 390--> + <!-- Line --> + <polyline + points="2775,675 2775,975 1425,975 1425,691 " + style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline44" /> + <!-- Arrowhead on XXXpoint 1425 975 - 1425 660--> + <!-- Line: box --> + <rect + x="2700" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect48" /> + <!-- Line: box --> + <rect + x="1125" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect50" /> + <!-- Line --> + <polyline + points="3300,675 3300,1050 1350,1050 1350,691 " + style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline52" /> + <!-- Arrowhead on XXXpoint 1350 1050 - 1350 660--> + <!-- Line --> + <polyline + points="825,1425 975,1425 975,1200 225,1200 225,691 " + style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline56" /> + <!-- Arrowhead on XXXpoint 225 1200 - 225 660--> + <!-- Line --> + <polyline + points="1200,675 1200,975 300,975 300,691 " + style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline60" /> + <!-- Arrowhead on XXXpoint 300 975 - 300 660--> + <!-- Text --> + <text + xml:space="preserve" + x="150" + y="1500" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="108" + text-anchor="start" + id="text64">->level[0]</text> + <!-- Text --> + <text + xml:space="preserve" + x="150" + y="1725" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="108" + text-anchor="start" + id="text66">->level[1]</text> + <!-- Text --> + <text + xml:space="preserve" + x="150" + y="1950" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="108" + text-anchor="start" + id="text68">->level[2]</text> + <!-- Text --> + <text + xml:space="preserve" + x="300" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + 
font-weight="normal" + font-size="96" + text-anchor="middle" + id="text70">0:7 </text> + <!-- Text --> + <text + xml:space="preserve" + x="1350" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text72">4:7 </text> + <!-- Text --> + <text + xml:space="preserve" + x="1875" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text74">0:1 </text> + <!-- Text --> + <text + xml:space="preserve" + x="2400" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text76">2:3 </text> + <!-- Text --> + <text + xml:space="preserve" + x="2925" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text78">4:5 </text> + <!-- Text --> + <text + xml:space="preserve" + x="3450" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text80">6:7 </text> + <!-- Text --> + <text + xml:space="preserve" + x="825" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text82">0:3 </text> + <!-- Text --> + <text + xml:space="preserve" + x="3600" + y="150" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="end" + id="text84">struct rcu_state</text> + <!-- Line --> + <polyline + points="825,1875 1800,1875 1800,1125 " + style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:none" + id="polyline86" /> + <!-- Line --> + <polyline + points="1800,900 1800,691 " + style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline88" /> + <!-- Arrowhead on XXXpoint 1800 900 - 1800 660--> + <!-- Line --> + <polyline + points="825,1650 1200,1650 1200,1125 750,1125 750,1050 " + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; " + id="polyline92" /> + </g> +</svg> diff --git a/Documentation/RCU/Design/Data-Structures/blkd_task.svg b/Documentation/RCU/Design/Data-Structures/blkd_task.svg new file mode 100644 index 000000000000..00e810bb8419 --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/blkd_task.svg @@ -0,0 +1,843 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- Creator: fig2dev Version 3.2 Patchlevel 5e --> + +<!-- CreationDate: Wed Dec 9 17:35:03 2015 --> + +<!-- Magnification: 2.000 --> + +<svg + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns="http://www.w3.org/2000/svg" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + width="10.1in" + height="8.6in" + viewBox="-44 -44 12088 10288" + id="svg2" + version="1.1" + inkscape:version="0.48.4 r9939" + sodipodi:docname="blkd_task.fig"> + <metadata + id="metadata212"> + <rdf:RDF> + <cc:Work + rdf:about=""> + <dc:format>image/svg+xml</dc:format> + <dc:type + rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> + 
<dc:title></dc:title> + </cc:Work> + </rdf:RDF> + </metadata> + <defs + id="defs210"> + <marker + inkscape:stockid="Arrow1Mend" + orient="auto" + refY="0.0" + refX="0.0" + id="Arrow1Mend" + style="overflow:visible;"> + <path + id="path3970" + d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z " + style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;" + transform="scale(0.4) rotate(180) translate(10,0)" /> + </marker> + </defs> + <sodipodi:namedview + pagecolor="#ffffff" + bordercolor="#666666" + borderopacity="1" + objecttolerance="10" + gridtolerance="10" + guidetolerance="10" + inkscape:pageopacity="0" + inkscape:pageshadow="2" + inkscape:window-width="1087" + inkscape:window-height="1144" + id="namedview208" + showgrid="false" + inkscape:zoom="1.0495049" + inkscape:cx="454.50003" + inkscape:cy="387.00003" + inkscape:window-x="833" + inkscape:window-y="28" + inkscape:window-maximized="0" + inkscape:current-layer="g4" /> + <g + style="stroke-width:.025in; fill:none" + id="g4"> + <!-- Line: box --> + <rect + x="450" + y="0" + width="6300" + height="7350" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; " + id="rect6" /> + <!-- Line: box --> + <rect + x="4950" + y="4950" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect8" /> + <!-- Line: box --> + <rect + x="750" + y="600" + width="5700" + height="3750" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; " + id="rect10" /> + <!-- Line --> + <polyline + points="5250,8100 5688,5912 " + style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="polyline12" /> + <!-- Arrowhead on XXXpoint 5250 8100 - 5710 5790--> + <polyline + points="5714 6068 5704 5822 5598 6044 " + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; " + id="polyline14" /> + <!-- Line --> + <polyline + points="4050,9300 4486,7262 " + style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="polyline16" /> + <!-- Arrowhead on XXXpoint 4050 9300 - 4512 7140--> + <polyline + points="4514 7418 4506 7172 4396 7394 " + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; " + id="polyline18" /> + <!-- Line --> + <polyline + points="1040,9300 1476,7262 " + style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="polyline20" /> + <!-- Arrowhead on XXXpoint 1040 9300 - 1502 7140--> + <polyline + points="1504 7418 1496 7172 1386 7394 " + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; " + id="polyline22" /> + <!-- Line --> + <polyline + points="2240,8100 2676,6062 " + style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="polyline24" /> + <!-- Arrowhead on XXXpoint 2240 8100 - 2702 5940--> + <polyline + points="2704 6218 2696 5972 2586 6194 " + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; " + id="polyline26" /> + <!-- Line: box --> + <rect + x="0" + y="450" + width="6300" + height="7350" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; " + id="rect28" /> + <!-- Line: box --> + <rect + x="300" + y="1050" + width="5700" + height="3750" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; " + id="rect30" /> + <!-- Line --> + <polyline + points="1350,3450 2350,2590 " + 
style="stroke:#00d1d1;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline32" /> + <!-- Arrowhead on XXXpoint 1350 3450 - 2444 2510--> + <!-- Line --> + <polyline + points="4950,3450 3948,2590 " + style="stroke:#00d1d1;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline36" /> + <!-- Arrowhead on XXXpoint 4950 3450 - 3854 2510--> + <!-- Line --> + <polyline + points="4050,6600 4050,4414 " + style="stroke:#00d1d1;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline40" /> + <!-- Arrowhead on XXXpoint 4050 6600 - 4050 4290--> + <!-- Line --> + <polyline + points="1050,6600 1050,4414 " + style="stroke:#00d1d1;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline44" /> + <!-- Arrowhead on XXXpoint 1050 6600 - 1050 4290--> + <!-- Line --> + <polyline + points="2250,5400 2250,4414 " + style="stroke:#00d1d1;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline48" /> + <!-- Arrowhead on XXXpoint 2250 5400 - 2250 4290--> + <!-- Line --> + <polyline + points="2250,8100 2250,6364 " + style="stroke:#00ff00;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline52" /> + <!-- Arrowhead on XXXpoint 2250 8100 - 2250 6240--> + <!-- Line --> + <polyline + points="1050,9300 1050,7564 " + style="stroke:#00ff00;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline56" /> + <!-- Arrowhead on XXXpoint 1050 9300 - 1050 7440--> + <!-- Line --> + <polyline + points="4050,9300 4050,7564 " + style="stroke:#00ff00;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline60" /> + <!-- Arrowhead on XXXpoint 4050 9300 - 4050 7440--> + <!-- Line --> + <polyline + points="5250,8100 5250,6364 " + style="stroke:#00ff00;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline64" /> + <!-- Arrowhead on XXXpoint 5250 8100 - 5250 6240--> + <!-- Circle --> + <circle + cx="2850" + cy="3900" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle68" /> + <!-- Circle --> + <circle + cx="3150" + cy="3900" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle70" /> + <!-- Circle --> + <circle + cx="3450" + cy="3900" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle72" /> + <!-- Circle --> + <circle + cx="1350" + cy="5100" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle74" /> + <!-- Circle --> + <circle + cx="1650" + cy="5100" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle76" /> + <!-- Circle --> + <circle + cx="1950" + cy="5100" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle78" /> + <!-- Circle --> + <circle + cx="4350" + cy="5100" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle80" 
/> + <!-- Circle --> + <circle + cx="4650" + cy="5100" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle82" /> + <!-- Circle --> + <circle + cx="4950" + cy="5100" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle84" /> + <!-- Line: box --> + <rect + x="750" + y="3450" + width="1800" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect86" /> + <!-- Line: box --> + <rect + x="300" + y="6600" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect88" /> + <!-- Line: box --> + <rect + x="4500" + y="5400" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect90" /> + <!-- Line: box --> + <rect + x="3300" + y="6600" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect92" /> + <!-- Line: box --> + <rect + x="2250" + y="1650" + width="1800" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect94" /> + <!-- Line: box --> + <rect + x="0" + y="9300" + width="2100" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; " + id="rect96" /> + <!-- Line: box --> + <rect + x="1350" + y="8100" + width="2100" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; " + id="rect98" /> + <!-- Line: box --> + <rect + x="3000" + y="9300" + width="2100" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; " + id="rect100" /> + <!-- Line: box --> + <rect + x="4350" + y="8100" + width="2100" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; " + id="rect102" /> + <!-- Line: box --> + <rect + x="1500" + y="5400" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect104" /> + <!-- Line --> + <polygon + points="5550,3450 7350,2850 7350,5100 5550,4350 5550,3450 " + style="stroke:#000000;stroke-width:14; stroke-linejoin:miter; stroke-linecap:butt; stroke-dasharray:120 120;fill:#ffbfbf; " + id="polygon106" /> + <!-- Line --> + <polyline + points="9300,3150 10734,3150 " + style="stroke:#000000;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline108" /> + <!-- Arrowhead on XXXpoint 9300 3150 - 10860 3150--> + <!-- Line: box --> + <rect + x="10800" + y="2850" + width="1200" + height="750" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect112" /> + <!-- Line --> + <polyline + points="11400,3600 11400,4284 " + style="stroke:#000000;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline114" /> + <!-- Arrowhead on XXXpoint 11400 3600 - 11400 4410--> + <!-- Line: box --> + <rect + x="10800" + y="4350" + width="1200" + height="750" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; 
stroke-linecap:butt; " + id="rect118" /> + <!-- Line --> + <polyline + points="11400,5100 11400,5784 " + style="stroke:#000000;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline120" /> + <!-- Arrowhead on XXXpoint 11400 5100 - 11400 5910--> + <!-- Line: box --> + <rect + x="10800" + y="5850" + width="1200" + height="750" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect124" /> + <!-- Line --> + <polyline + points="9300,3900 9900,3900 9900,4650 10734,4650 " + style="stroke:#000000;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline126" /> + <!-- Arrowhead on XXXpoint 9900 4650 - 10860 4650--> + <!-- Line --> + <polyline + points="9300,4650 9600,4650 9600,6150 10734,6150 " + style="stroke:#000000;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline130" /> + <!-- Arrowhead on XXXpoint 9600 6150 - 10860 6150--> + <!-- Text --> + <text + xml:space="preserve" + x="6450" + y="300" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="192" + text-anchor="end" + id="text134">rcu_bh</text> + <!-- Text --> + <text + xml:space="preserve" + x="3150" + y="1950" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text136">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="3150" + y="2250" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text138">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="1650" + y="3750" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text140">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="1650" + y="4050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text142">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="2250" + y="5700" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text144">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="2250" + y="6000" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text146">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="1050" + y="6900" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text148">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="1050" + y="7200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text150">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="5250" + y="5700" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text152">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="5250" + y="6000" + fill="#000000" + font-family="Courier" + font-style="normal" + 
font-weight="bold" + font-size="192" + text-anchor="middle" + id="text154">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="4050" + y="6900" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text156">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="4050" + y="7200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text158">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="450" + y="1350" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="start" + id="text160">struct rcu_state</text> + <!-- Text --> + <text + xml:space="preserve" + x="1050" + y="9600" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text162">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="1050" + y="9900" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text164">rcu_dynticks</text> + <!-- Text --> + <text + xml:space="preserve" + x="4050" + y="9600" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text166">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="4050" + y="9900" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text168">rcu_dynticks</text> + <!-- Text --> + <text + xml:space="preserve" + x="2400" + y="8400" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text170">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="2400" + y="8700" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text172">rcu_dynticks</text> + <!-- Text --> + <text + xml:space="preserve" + x="5400" + y="8400" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text174">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="5400" + y="8700" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text176">rcu_dynticks</text> + <!-- Text --> + <text + xml:space="preserve" + x="6000" + y="750" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="192" + text-anchor="end" + id="text178">rcu_sched</text> + <!-- Text --> + <text + xml:space="preserve" + x="11400" + y="3300" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="216" + text-anchor="middle" + id="text180">T3</text> + <!-- Text --> + <text + xml:space="preserve" + x="11400" + y="4800" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="216" + text-anchor="middle" + id="text182">T2</text> + <!-- Text --> + <text + xml:space="preserve" + x="11400" + y="6300" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="216" + text-anchor="middle" + id="text184">T1</text> + <!-- Line --> + <polyline + points="5250,5400 5250,4414 " + 
style="stroke:#00d1d1;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline186" /> + <!-- Arrowhead on XXXpoint 5250 5400 - 5250 4290--> + <!-- Line: box --> + <rect + x="3750" + y="3450" + width="1800" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect190" /> + <!-- Line: box --> + <rect + x="7350" + y="2850" + width="1950" + height="750" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect192" /> + <!-- Line: box --> + <rect + x="7350" + y="3600" + width="1950" + height="750" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect194" /> + <!-- Line: box --> + <rect + x="7350" + y="4350" + width="1950" + height="750" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect196" /> + <!-- Text --> + <text + xml:space="preserve" + x="4650" + y="4050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text198">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="4650" + y="3750" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text200">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="7500" + y="3300" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="start" + id="text202">blkd_tasks</text> + <!-- Text --> + <text + xml:space="preserve" + x="7500" + y="4050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="start" + id="text204">gp_tasks</text> + <!-- Text --> + <text + xml:space="preserve" + x="7500" + y="4800" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="start" + id="text206">exp_tasks</text> + </g> +</svg> diff --git a/Documentation/RCU/Design/Data-Structures/nxtlist.svg b/Documentation/RCU/Design/Data-Structures/nxtlist.svg new file mode 100644 index 000000000000..abc4cc73a097 --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/nxtlist.svg @@ -0,0 +1,396 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- Creator: fig2dev Version 3.2 Patchlevel 5e --> + +<!-- CreationDate: Wed Dec 9 17:39:46 2015 --> + +<!-- Magnification: 3.000 --> + +<svg + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns="http://www.w3.org/2000/svg" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + width="10.4in" + height="10.4in" + viewBox="-66 -66 12507 12507" + id="svg2" + version="1.1" + inkscape:version="0.48.4 r9939" + sodipodi:docname="nxtlist.fig"> + <metadata + id="metadata94"> + <rdf:RDF> + <cc:Work + rdf:about=""> + <dc:format>image/svg+xml</dc:format> + <dc:type + rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> + <dc:title></dc:title> + </cc:Work> + </rdf:RDF> + </metadata> + <defs + id="defs92"> + <marker + inkscape:stockid="Arrow1Mend" + orient="auto" + refY="0.0" + refX="0.0" + 
id="Arrow1Mend" + style="overflow:visible;"> + <path + id="path3852" + d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z " + style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;" + transform="scale(0.4) rotate(180) translate(10,0)" /> + </marker> + </defs> + <sodipodi:namedview + pagecolor="#ffffff" + bordercolor="#666666" + borderopacity="1" + objecttolerance="10" + gridtolerance="10" + guidetolerance="10" + inkscape:pageopacity="0" + inkscape:pageshadow="2" + inkscape:window-width="925" + inkscape:window-height="928" + id="namedview90" + showgrid="false" + inkscape:zoom="0.80021373" + inkscape:cx="467.99997" + inkscape:cy="467.99997" + inkscape:window-x="948" + inkscape:window-y="73" + inkscape:window-maximized="0" + inkscape:current-layer="g4" /> + <g + style="stroke-width:.025in; fill:none" + id="g4"> + <!-- Line: box --> + <rect + x="0" + y="0" + width="7875" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect6" /> + <!-- Line: box --> + <rect + x="0" + y="1125" + width="7875" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect8" /> + <!-- Line: box --> + <rect + x="0" + y="2250" + width="7875" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect10" /> + <!-- Line: box --> + <rect + x="0" + y="3375" + width="7875" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect12" /> + <!-- Line: box --> + <rect + x="0" + y="4500" + width="7875" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect14" /> + <!-- Line: box --> + <rect + x="10575" + y="0" + width="1800" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect16" /> + <!-- Line: box --> + <rect + x="10575" + y="1125" + width="1800" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect18" /> + <!-- Line --> + <polyline + points="11475,2250 11475,3276 " + style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline20" /> + <!-- Arrowhead on XXXpoint 11475 2250 - 11475 3465--> + <!-- Line: box --> + <rect + x="10575" + y="6750" + width="1800" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect24" /> + <!-- Line: box --> + <rect + x="10575" + y="7875" + width="1800" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect26" /> + <!-- Line: box --> + <rect + x="10575" + y="10125" + width="1800" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect28" /> + <!-- Line: box --> + <rect + x="10575" + y="11250" + width="1800" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect30" /> + <!-- Line: box --> + <rect + x="10575" + y="3375" + width="1800" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect32" /> + <!-- Line --> + 
<polyline + points="11475,5625 11475,6651 " + style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline34" /> + <!-- Arrowhead on XXXpoint 11475 5625 - 11475 6840--> + <!-- Line --> + <polyline + points="7875,225 10476,225 " + style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline38" /> + <!-- Arrowhead on XXXpoint 7875 225 - 10665 225--> + <!-- Line --> + <polyline + points="7875,1350 9675,1350 9675,675 7971,675 " + style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline42" /> + <!-- Arrowhead on XXXpoint 9675 675 - 7785 675--> + <!-- Line --> + <polyline + points="7875,2475 9675,2475 9675,4725 10476,4725 " + style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline46" /> + <!-- Arrowhead on XXXpoint 9675 4725 - 10665 4725--> + <!-- Line --> + <polyline + points="7875,3600 9225,3600 9225,5175 10476,5175 " + style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline50" /> + <!-- Arrowhead on XXXpoint 9225 5175 - 10665 5175--> + <!-- Line --> + <polyline + points="7875,4725 8775,4725 8775,11475 10476,11475 " + style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline54" /> + <!-- Arrowhead on XXXpoint 8775 11475 - 10665 11475--> + <!-- Line: box --> + <rect + x="10575" + y="4500" + width="1800" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect58" /> + <!-- Line --> + <polyline + points="11475,9000 11475,10026 " + style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline60" /> + <!-- Arrowhead on XXXpoint 11475 9000 - 11475 10215--> + <!-- Text --> + <text + xml:space="preserve" + x="225" + y="675" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="324" + text-anchor="start" + id="text64">nxtlist</text> + <!-- Text --> + <text + xml:space="preserve" + x="225" + y="1800" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="324" + text-anchor="start" + id="text66">nxttail[RCU_DONE_TAIL]</text> + <!-- Text --> + <text + xml:space="preserve" + x="225" + y="2925" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="324" + text-anchor="start" + id="text68">nxttail[RCU_WAIT_TAIL]</text> + <!-- Text --> + <text + xml:space="preserve" + x="225" + y="4050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="324" + text-anchor="start" + id="text70">nxttail[RCU_NEXT_READY_TAIL]</text> + <!-- Text --> + <text + xml:space="preserve" + x="225" + y="5175" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="324" + text-anchor="start" + id="text72">nxttail[RCU_NEXT_TAIL]</text> + <!-- Text --> + <text + 
xml:space="preserve" + x="11475" + y="675" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text74">CB 1</text> + <!-- Text --> + <text + xml:space="preserve" + x="11475" + y="1800" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text76">next</text> + <!-- Text --> + <text + xml:space="preserve" + x="11475" + y="7425" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text78">CB 3</text> + <!-- Text --> + <text + xml:space="preserve" + x="11475" + y="8550" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text80">next</text> + <!-- Text --> + <text + xml:space="preserve" + x="11475" + y="10800" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text82">CB 4</text> + <!-- Text --> + <text + xml:space="preserve" + x="11475" + y="11925" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text84">next</text> + <!-- Text --> + <text + xml:space="preserve" + x="11475" + y="4050" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text86">CB 2</text> + <!-- Text --> + <text + xml:space="preserve" + x="11475" + y="5175" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text88">next</text> + </g> +</svg> diff --git a/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png b/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png Binary files differdeleted file mode 100644 index 7496a55e4e7b..000000000000 --- a/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png +++ /dev/null diff --git a/Documentation/RCU/Design/Requirements/RCUApplicability.svg b/Documentation/RCU/Design/Requirements/RCUApplicability.svg deleted file mode 100644 index ebcbeee391ed..000000000000 --- a/Documentation/RCU/Design/Requirements/RCUApplicability.svg +++ /dev/null @@ -1,237 +0,0 @@ -<?xml version="1.0" encoding="UTF-8" standalone="no"?> -<!-- Creator: fig2dev Version 3.2 Patchlevel 5d --> - -<!-- CreationDate: Tue Mar 4 18:34:25 2014 --> - -<!-- Magnification: 3.000 --> - -<svg - xmlns:dc="http://purl.org/dc/elements/1.1/" - xmlns:cc="http://creativecommons.org/ns#" - xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" - xmlns:svg="http://www.w3.org/2000/svg" - xmlns="http://www.w3.org/2000/svg" - xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" - xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" - width="1089.1382" - height="668.21368" - viewBox="-2121 -36 14554.634 8876.4061" - id="svg2" - version="1.1" - inkscape:version="0.48.3.1 r9886" - sodipodi:docname="RCUApplicability.svg"> - <metadata - id="metadata40"> - <rdf:RDF> - <cc:Work - rdf:about=""> - <dc:format>image/svg+xml</dc:format> - <dc:type - rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> - <dc:title /> - </cc:Work> - </rdf:RDF> - </metadata> - <defs - id="defs38" /> - <sodipodi:namedview - pagecolor="#ffffff" - bordercolor="#666666" - borderopacity="1" - objecttolerance="10" - gridtolerance="10" - guidetolerance="10" - 
inkscape:pageopacity="0" - inkscape:pageshadow="2" - inkscape:window-width="849" - inkscape:window-height="639" - id="namedview36" - showgrid="false" - inkscape:zoom="0.51326165" - inkscape:cx="544.56912" - inkscape:cy="334.10686" - inkscape:window-x="149" - inkscape:window-y="448" - inkscape:window-maximized="0" - inkscape:current-layer="g4" - fit-margin-top="5" - fit-margin-left="5" - fit-margin-right="5" - fit-margin-bottom="5" /> - <g - style="fill:none;stroke-width:0.025in" - id="g4" - transform="translate(-2043.6828,14.791398)"> - <!-- Line: box --> - <rect - x="0" - y="0" - width="14400" - height="8775" - rx="0" - style="fill:#ffa1a1;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter" - id="rect6" /> - <!-- Line: box --> - <rect - x="1350" - y="0" - width="11700" - height="6075" - rx="0" - style="fill:#ffff00;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter" - id="rect8" /> - <!-- Line: box --> - <rect - x="2700" - y="0" - width="9000" - height="4275" - rx="0" - style="fill:#00ff00;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter" - id="rect10" /> - <!-- Line: box --> - <rect - x="4050" - y="0" - width="6300" - height="2475" - rx="0" - style="fill:#87cfff;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter" - id="rect12" /> - <!-- Text --> - <text - xml:space="preserve" - x="7200" - y="900" - font-style="normal" - font-weight="normal" - font-size="324" - id="text14" - sodipodi:linespacing="125%" - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" - id="tspan3017">Read-Mostly, Stale &</tspan></text> - <!-- Text --> - <text - xml:space="preserve" - x="7200" - y="1350" - font-style="normal" - font-weight="normal" - font-size="324" - id="text16" - sodipodi:linespacing="125%" - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" - id="tspan3019">Inconsistent Data OK</tspan></text> - <!-- Text --> - <text - xml:space="preserve" - x="7200" - y="1800" - font-style="normal" - font-weight="normal" - font-size="324" - id="text18" - sodipodi:linespacing="125%" - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus 
Sans L" - id="tspan3021">(RCU Works Great!!!)</tspan></text> - <!-- Text --> - <text - xml:space="preserve" - x="7200" - y="3825" - font-style="normal" - font-weight="normal" - font-size="324" - id="text20" - sodipodi:linespacing="125%" - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" - id="tspan3023">(RCU Works Well)</tspan></text> - <!-- Text --> - <text - xml:space="preserve" - x="7200" - y="3375" - font-style="normal" - font-weight="normal" - font-size="324" - id="text22" - sodipodi:linespacing="125%" - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" - id="tspan3025">Read-Mostly, Need Consistent Data</tspan></text> - <!-- Text --> - <text - xml:space="preserve" - x="7200" - y="5175" - font-style="normal" - font-weight="normal" - font-size="324" - id="text24" - sodipodi:linespacing="125%" - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" - id="tspan3027">Read-Write, Need Consistent Data</tspan></text> - <!-- Text --> - <text - xml:space="preserve" - x="7200" - y="6975" - font-style="normal" - font-weight="normal" - font-size="324" - id="text26" - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" - sodipodi:linespacing="125%">Update-Mostly, Need Consistent Data</text> - <!-- Text --> - <text - xml:space="preserve" - x="7200" - y="5625" - font-style="normal" - font-weight="normal" - font-size="324" - id="text28" - sodipodi:linespacing="125%" - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" - 
id="tspan3029">(RCU Might Be OK...)</tspan></text> - <!-- Text --> - <text - xml:space="preserve" - x="7200" - y="7875" - font-style="normal" - font-weight="normal" - font-size="324" - id="text30" - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" - sodipodi:linespacing="125%">(1) Provide Existence Guarantees For Update-Friendly Mechanisms</text> - <!-- Text --> - <text - xml:space="preserve" - x="7200" - y="8325" - font-style="normal" - font-weight="normal" - font-size="324" - id="text32" - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" - sodipodi:linespacing="125%">(2) Provide Wait-Free Read-Side Primitives for Real-Time Use)</text> - <!-- Text --> - <text - xml:space="preserve" - x="7200" - y="7425" - font-style="normal" - font-weight="normal" - font-size="324" - id="text34" - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" - sodipodi:linespacing="125%">(RCU is Very Unlikely to be the Right Tool For The Job, But it Can:</text> - </g> -</svg> diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index a725f9900ec8..e7e24b3e86e2 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -1,5 +1,3 @@ -<!-- DO NOT HAND EDIT. --> -<!-- Instead, edit Documentation/RCU/Design/Requirements/Requirements.htmlx and run 'sh htmlqqz.sh Documentation/RCU/Design/Requirements/Requirements' --> <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> <html> @@ -65,8 +63,8 @@ All that aside, here are the categories of currently known RCU requirements: <p> This is followed by a <a href="#Summary">summary</a>, -which is in turn followed by the inevitable -<a href="#Answers to Quick Quizzes">answers to the quick quizzes</a>. +however, the answers to each quick quiz immediately follows the quiz. +Select the big white space with your mouse to see the answer. <h2><a name="Fundamental Requirements">Fundamental Requirements</a></h2> @@ -153,13 +151,27 @@ Therefore, the outcome: </blockquote> cannot happen. -<p><a name="Quick Quiz 1"><b>Quick Quiz 1</b>:</a> -Wait a minute! -You said that updaters can make useful forward progress concurrently -with readers, but pre-existing readers will block -<tt>synchronize_rcu()</tt>!!! -Just who are you trying to fool??? -<br><a href="#qq1answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + Wait a minute! + You said that updaters can make useful forward progress concurrently + with readers, but pre-existing readers will block + <tt>synchronize_rcu()</tt>!!! + Just who are you trying to fool??? 
+</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + First, if updaters do not wish to be blocked by readers, they can use + <tt>call_rcu()</tt> or <tt>kfree_rcu()</tt>, which will + be discussed later. + Second, even when using <tt>synchronize_rcu()</tt>, the other + update-side code does run concurrently with readers, whether + pre-existing or not. +</font></td></tr> +<tr><td> </td></tr> +</table> <p> This scenario resembles one of the first uses of RCU in @@ -210,9 +222,20 @@ to guarantee that <tt>do_something()</tt> never runs concurrently with <tt>recovery()</tt>, but with little or no synchronization overhead in <tt>do_something_dlm()</tt>. -<p><a name="Quick Quiz 2"><b>Quick Quiz 2</b>:</a> -Why is the <tt>synchronize_rcu()</tt> on line 28 needed? -<br><a href="#qq2answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + Why is the <tt>synchronize_rcu()</tt> on line 28 needed? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + Without that extra grace period, memory reordering could result in + <tt>do_something_dlm()</tt> executing <tt>do_something()</tt> + concurrently with the last bits of <tt>recovery()</tt>. +</font></td></tr> +<tr><td> </td></tr> +</table> <p> In order to avoid fatal problems such as deadlocks, @@ -332,12 +355,27 @@ It also prevents any number of “interesting” compiler optimizations, for example, the use of <tt>gp</tt> as a scratch location immediately preceding the assignment. -<p><a name="Quick Quiz 3"><b>Quick Quiz 3</b>:</a> -But <tt>rcu_assign_pointer()</tt> does nothing to prevent the -two assignments to <tt>p->a</tt> and <tt>p->b</tt> -from being reordered. -Can't that also cause problems? -<br><a href="#qq3answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + But <tt>rcu_assign_pointer()</tt> does nothing to prevent the + two assignments to <tt>p->a</tt> and <tt>p->b</tt> + from being reordered. + Can't that also cause problems? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + No, it cannot. + The readers cannot see either of these two fields until + the assignment to <tt>gp</tt>, by which time both fields are + fully initialized. + So reordering the assignments + to <tt>p->a</tt> and <tt>p->b</tt> cannot possibly + cause any problems. +</font></td></tr> +<tr><td> </td></tr> +</table> <p> It is tempting to assume that the reader need not do anything special @@ -494,11 +532,42 @@ The <tt>rcu_access_pointer()</tt> on line 6 is similar to code protected by the corresponding update-side lock. </ol> -<p><a name="Quick Quiz 4"><b>Quick Quiz 4</b>:</a> -Without the <tt>rcu_dereference()</tt> or the -<tt>rcu_access_pointer()</tt>, what destructive optimizations -might the compiler make use of? -<br><a href="#qq4answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + Without the <tt>rcu_dereference()</tt> or the + <tt>rcu_access_pointer()</tt>, what destructive optimizations + might the compiler make use of? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + Let's start with what happens to <tt>do_something_gp()</tt> + if it fails to use <tt>rcu_dereference()</tt>. + It could reuse a value formerly fetched from this same pointer. 
+ It could also fetch the pointer from <tt>gp</tt> in a byte-at-a-time + manner, resulting in <i>load tearing</i>, in turn resulting a bytewise + mash-up of two distince pointer values. + It might even use value-speculation optimizations, where it makes + a wrong guess, but by the time it gets around to checking the + value, an update has changed the pointer to match the wrong guess. + Too bad about any dereferences that returned pre-initialization garbage + in the meantime! + </font> + + <p><font color="ffffff"> + For <tt>remove_gp_synchronous()</tt>, as long as all modifications + to <tt>gp</tt> are carried out while holding <tt>gp_lock</tt>, + the above optimizations are harmless. + However, + with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt>, + <tt>sparse</tt> will complain if you + define <tt>gp</tt> with <tt>__rcu</tt> and then + access it without using + either <tt>rcu_access_pointer()</tt> or <tt>rcu_dereference()</tt>. +</font></td></tr> +<tr><td> </td></tr> +</table> <p> In short, RCU's publish-subscribe guarantee is provided by the combination @@ -571,17 +640,156 @@ systems with more than one CPU: <tt>synchronize_rcu()</tt> migrates in the meantime. </ol> -<p><a name="Quick Quiz 5"><b>Quick Quiz 5</b>:</a> -Given that multiple CPUs can start RCU read-side critical sections -at any time without any ordering whatsoever, how can RCU possibly tell whether -or not a given RCU read-side critical section starts before a -given instance of <tt>synchronize_rcu()</tt>? -<br><a href="#qq5answer">Answer</a> - -<p><a name="Quick Quiz 6"><b>Quick Quiz 6</b>:</a> -The first and second guarantees require unbelievably strict ordering! -Are all these memory barriers <i> really</i> required? -<br><a href="#qq6answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + Given that multiple CPUs can start RCU read-side critical sections + at any time without any ordering whatsoever, how can RCU possibly + tell whether or not a given RCU read-side critical section starts + before a given instance of <tt>synchronize_rcu()</tt>? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + If RCU cannot tell whether or not a given + RCU read-side critical section starts before a + given instance of <tt>synchronize_rcu()</tt>, + then it must assume that the RCU read-side critical section + started first. + In other words, a given instance of <tt>synchronize_rcu()</tt> + can avoid waiting on a given RCU read-side critical section only + if it can prove that <tt>synchronize_rcu()</tt> started first. +</font></td></tr> +<tr><td> </td></tr> +</table> + +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + The first and second guarantees require unbelievably strict ordering! + Are all these memory barriers <i> really</i> required? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + Yes, they really are required. + To see why the first guarantee is required, consider the following + sequence of events: + </font> + + <ol> + <li> <font color="ffffff"> + CPU 1: <tt>rcu_read_lock()</tt> + </font> + <li> <font color="ffffff"> + CPU 1: <tt>q = rcu_dereference(gp); + /* Very likely to return p. */</tt> + </font> + <li> <font color="ffffff"> + CPU 0: <tt>list_del_rcu(p);</tt> + </font> + <li> <font color="ffffff"> + CPU 0: <tt>synchronize_rcu()</tt> starts. 
+ </font> + <li> <font color="ffffff"> + CPU 1: <tt>do_something_with(q->a); + /* No smp_mb(), so might happen after kfree(). */</tt> + </font> + <li> <font color="ffffff"> + CPU 1: <tt>rcu_read_unlock()</tt> + </font> + <li> <font color="ffffff"> + CPU 0: <tt>synchronize_rcu()</tt> returns. + </font> + <li> <font color="ffffff"> + CPU 0: <tt>kfree(p);</tt> + </font> + </ol> + + <p><font color="ffffff"> + Therefore, there absolutely must be a full memory barrier between the + end of the RCU read-side critical section and the end of the + grace period. + </font> + + <p><font color="ffffff"> + The sequence of events demonstrating the necessity of the second rule + is roughly similar: + </font> + + <ol> + <li> <font color="ffffff">CPU 0: <tt>list_del_rcu(p);</tt> + </font> + <li> <font color="ffffff">CPU 0: <tt>synchronize_rcu()</tt> starts. + </font> + <li> <font color="ffffff">CPU 1: <tt>rcu_read_lock()</tt> + </font> + <li> <font color="ffffff">CPU 1: <tt>q = rcu_dereference(gp); + /* Might return p if no memory barrier. */</tt> + </font> + <li> <font color="ffffff">CPU 0: <tt>synchronize_rcu()</tt> returns. + </font> + <li> <font color="ffffff">CPU 0: <tt>kfree(p);</tt> + </font> + <li> <font color="ffffff"> + CPU 1: <tt>do_something_with(q->a); /* Boom!!! */</tt> + </font> + <li> <font color="ffffff">CPU 1: <tt>rcu_read_unlock()</tt> + </font> + </ol> + + <p><font color="ffffff"> + And similarly, without a memory barrier between the beginning of the + grace period and the beginning of the RCU read-side critical section, + CPU 1 might end up accessing the freelist. + </font> + + <p><font color="ffffff"> + The “as if” rule of course applies, so that any + implementation that acts as if the appropriate memory barriers + were in place is a correct implementation. + That said, it is much easier to fool yourself into believing + that you have adhered to the as-if rule than it is to actually + adhere to it! +</font></td></tr> +<tr><td> </td></tr> +</table> + +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + You claim that <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt> + generate absolutely no code in some kernel builds. + This means that the compiler might arbitrarily rearrange consecutive + RCU read-side critical sections. + Given such rearrangement, if a given RCU read-side critical section + is done, how can you be sure that all prior RCU read-side critical + sections are done? + Won't the compiler rearrangements make that impossible to determine? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + In cases where <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt> + generate absolutely no code, RCU infers quiescent states only at + special locations, for example, within the scheduler. + Because calls to <tt>schedule()</tt> had better prevent calling-code + accesses to shared variables from being rearranged across the call to + <tt>schedule()</tt>, if RCU detects the end of a given RCU read-side + critical section, it will necessarily detect the end of all prior + RCU read-side critical sections, no matter how aggressively the + compiler scrambles the code. + </font> + + <p><font color="ffffff"> + Again, this all assumes that the compiler cannot scramble code across + calls to the scheduler, out of interrupt handlers, into the idle loop, + into user-mode code, and so on. + But if your kernel build allows that sort of scrambling, you have broken + far more than just RCU! 
+</font></td></tr> +<tr><td> </td></tr> +</table> <p> Note that these memory-barrier requirements do not replace the fundamental @@ -626,9 +834,19 @@ inconvenience can be avoided through use of the <tt>call_rcu()</tt> and <tt>kfree_rcu()</tt> API members described later in this document. -<p><a name="Quick Quiz 7"><b>Quick Quiz 7</b>:</a> -But how does the upgrade-to-write operation exclude other readers? -<br><a href="#qq7answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + But how does the upgrade-to-write operation exclude other readers? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + It doesn't, just like normal RCU updates, which also do not exclude + RCU readers. +</font></td></tr> +<tr><td> </td></tr> +</table> <p> This guarantee allows lookup code to be shared between read-side @@ -714,9 +932,20 @@ to do significant reordering. This is by design: Any significant ordering constraints would slow down these fast-path APIs. -<p><a name="Quick Quiz 8"><b>Quick Quiz 8</b>:</a> -Can't the compiler also reorder this code? -<br><a href="#qq8answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + Can't the compiler also reorder this code? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + No, the volatile casts in <tt>READ_ONCE()</tt> and + <tt>WRITE_ONCE()</tt> prevent the compiler from reordering in + this particular case. +</font></td></tr> +<tr><td> </td></tr> +</table> <h3><a name="Readers Do Not Exclude Updaters">Readers Do Not Exclude Updaters</a></h3> @@ -769,10 +998,28 @@ new readers can start immediately after <tt>synchronize_rcu()</tt> starts, and <tt>synchronize_rcu()</tt> is under no obligation to wait for these new readers. -<p><a name="Quick Quiz 9"><b>Quick Quiz 9</b>:</a> -Suppose that synchronize_rcu() did wait until all readers had completed. -Would the updater be able to rely on this? -<br><a href="#qq9answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + Suppose that synchronize_rcu() did wait until <i>all</i> + readers had completed instead of waiting only on + pre-existing readers. + For how long would the updater be able to rely on there + being no readers? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + For no time at all. + Even if <tt>synchronize_rcu()</tt> were to wait until + all readers had completed, a new reader might start immediately after + <tt>synchronize_rcu()</tt> completed. + Therefore, the code following + <tt>synchronize_rcu()</tt> can <i>never</i> rely on there being + no readers. +</font></td></tr> +<tr><td> </td></tr> +</table> <h3><a name="Grace Periods Don't Partition Read-Side Critical Sections"> Grace Periods Don't Partition Read-Side Critical Sections</a></h3> @@ -969,11 +1216,24 @@ grace period. As a result, an RCU read-side critical section cannot partition a pair of RCU grace periods. -<p><a name="Quick Quiz 10"><b>Quick Quiz 10</b>:</a> -How long a sequence of grace periods, each separated by an RCU read-side -critical section, would be required to partition the RCU read-side -critical sections at the beginning and end of the chain? 
-<br><a href="#qq10answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + How long a sequence of grace periods, each separated by an RCU + read-side critical section, would be required to partition the RCU + read-side critical sections at the beginning and end of the chain? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + In theory, an infinite number. + In practice, an unknown number that is sensitive to both implementation + details and timing considerations. + Therefore, even in practice, RCU users must abide by the + theoretical rather than the practical answer. +</font></td></tr> +<tr><td> </td></tr> +</table> <h3><a name="Disabling Preemption Does Not Block Grace Periods"> Disabling Preemption Does Not Block Grace Periods</a></h3> @@ -1109,12 +1369,27 @@ These classes is covered in the following sections. <h3><a name="Specialization">Specialization</a></h3> <p> -RCU is and always has been intended primarily for read-mostly situations, as -illustrated by the following figure. -This means that RCU's read-side primitives are optimized, often at the +RCU is and always has been intended primarily for read-mostly situations, +which means that RCU's read-side primitives are optimized, often at the expense of its update-side primitives. +Experience thus far is captured by the following list of situations: -<p><img src="RCUApplicability.svg" alt="RCUApplicability.svg" width="70%"></p> +<ol> +<li> Read-mostly data, where stale and inconsistent data is not + a problem: RCU works great! +<li> Read-mostly data, where data must be consistent: + RCU works well. +<li> Read-write data, where data must be consistent: + RCU <i>might</i> work OK. + Or not. +<li> Write-mostly data, where data must be consistent: + RCU is very unlikely to be the right tool for the job, + with the following exceptions, where RCU can provide: + <ol type=a> + <li> Existence guarantees for update-friendly mechanisms. + <li> Wait-free read-side primitives for real-time use. + </ol> +</ol> <p> This focus on read-mostly situations means that RCU must interoperate @@ -1127,9 +1402,43 @@ synchronization primitives be legal within RCU read-side critical sections, including spinlocks, sequence locks, atomic operations, reference counters, and memory barriers. -<p><a name="Quick Quiz 11"><b>Quick Quiz 11</b>:</a> -What about sleeping locks? -<br><a href="#qq11answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + What about sleeping locks? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + These are forbidden within Linux-kernel RCU read-side critical + sections because it is not legal to place a quiescent state + (in this case, voluntary context switch) within an RCU read-side + critical section. + However, sleeping locks may be used within userspace RCU read-side + critical sections, and also within Linux-kernel sleepable RCU + <a href="#Sleepable RCU"><font color="ffffff">(SRCU)</font></a> + read-side critical sections. + In addition, the -rt patchset turns spinlocks into a + sleeping locks so that the corresponding critical sections + can be preempted, which also means that these sleeplockified + spinlocks (but not other sleeping locks!) may be acquire within + -rt-Linux-kernel RCU read-side critical sections. 
+ </font> + + <p><font color="ffffff"> + Note that it <i>is</i> legal for a normal RCU read-side + critical section to conditionally acquire a sleeping locks + (as in <tt>mutex_trylock()</tt>), but only as long as it does + not loop indefinitely attempting to conditionally acquire that + sleeping locks. + The key point is that things like <tt>mutex_trylock()</tt> + either return with the mutex held, or return an error indication if + the mutex was not immediately available. + Either way, <tt>mutex_trylock()</tt> returns immediately without + sleeping. +</font></td></tr> +<tr><td> </td></tr> +</table> <p> It often comes as a surprise that many algorithms do not require a @@ -1160,10 +1469,7 @@ some period of time, so the exact wait period is a judgment call. One of our pair of veternarians might wait 30 seconds before pronouncing the cat dead, while the other might insist on waiting a full minute. The two veternarians would then disagree on the state of the cat during -the final 30 seconds of the minute following the last heartbeat, as -fancifully illustrated below: - -<p><img src="2013-08-is-it-dead.png" alt="2013-08-is-it-dead.png" width="431"></p> +the final 30 seconds of the minute following the last heartbeat. <p> Interestingly enough, this same situation applies to hardware. @@ -1343,7 +1649,8 @@ situations where neither <tt>synchronize_rcu()</tt> nor <tt>synchronize_rcu_expedited()</tt> would be legal, including within preempt-disable code, <tt>local_bh_disable()</tt> code, interrupt-disable code, and interrupt handlers. -However, even <tt>call_rcu()</tt> is illegal within NMI handlers. +However, even <tt>call_rcu()</tt> is illegal within NMI handlers +and from idle and offline CPUs. The callback function (<tt>remove_gp_cb()</tt> in this case) will be executed within softirq (software interrupt) environment within the Linux kernel, @@ -1354,12 +1661,27 @@ write an RCU callback function that takes too long. Long-running operations should be relegated to separate threads or (in the Linux kernel) workqueues. -<p><a name="Quick Quiz 12"><b>Quick Quiz 12</b>:</a> -Why does line 19 use <tt>rcu_access_pointer()</tt>? -After all, <tt>call_rcu()</tt> on line 25 stores into the -structure, which would interact badly with concurrent insertions. -Doesn't this mean that <tt>rcu_dereference()</tt> is required? -<br><a href="#qq12answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + Why does line 19 use <tt>rcu_access_pointer()</tt>? + After all, <tt>call_rcu()</tt> on line 25 stores into the + structure, which would interact badly with concurrent insertions. + Doesn't this mean that <tt>rcu_dereference()</tt> is required? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + Presumably the <tt>->gp_lock</tt> acquired on line 18 excludes + any changes, including any insertions that <tt>rcu_dereference()</tt> + would protect against. + Therefore, any insertions will be delayed until after + <tt>->gp_lock</tt> + is released on line 25, which in turn means that + <tt>rcu_access_pointer()</tt> suffices. +</font></td></tr> +<tr><td> </td></tr> +</table> <p> However, all that <tt>remove_gp_cb()</tt> is doing is @@ -1406,14 +1728,31 @@ This was due to the fact that RCU was not heavily used within DYNIX/ptx, so the very few places that needed something like <tt>synchronize_rcu()</tt> simply open-coded it. 
-<p><a name="Quick Quiz 13"><b>Quick Quiz 13</b>:</a> -Earlier it was claimed that <tt>call_rcu()</tt> and -<tt>kfree_rcu()</tt> allowed updaters to avoid being blocked -by readers. -But how can that be correct, given that the invocation of the callback -and the freeing of the memory (respectively) must still wait for -a grace period to elapse? -<br><a href="#qq13answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + Earlier it was claimed that <tt>call_rcu()</tt> and + <tt>kfree_rcu()</tt> allowed updaters to avoid being blocked + by readers. + But how can that be correct, given that the invocation of the callback + and the freeing of the memory (respectively) must still wait for + a grace period to elapse? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + We could define things this way, but keep in mind that this sort of + definition would say that updates in garbage-collected languages + cannot complete until the next time the garbage collector runs, + which does not seem at all reasonable. + The key point is that in most cases, an updater using either + <tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> can proceed to the + next update as soon as it has invoked <tt>call_rcu()</tt> or + <tt>kfree_rcu()</tt>, without having to wait for a subsequent + grace period. +</font></td></tr> +<tr><td> </td></tr> +</table> <p> But what if the updater must wait for the completion of code to be @@ -1838,11 +2177,26 @@ kthreads to be spawned. Therefore, invoking <tt>synchronize_rcu()</tt> during scheduler initialization can result in deadlock. -<p><a name="Quick Quiz 14"><b>Quick Quiz 14</b>:</a> -So what happens with <tt>synchronize_rcu()</tt> during -scheduler initialization for <tt>CONFIG_PREEMPT=n</tt> -kernels? -<br><a href="#qq14answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + So what happens with <tt>synchronize_rcu()</tt> during + scheduler initialization for <tt>CONFIG_PREEMPT=n</tt> + kernels? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + In <tt>CONFIG_PREEMPT=n</tt> kernel, <tt>synchronize_rcu()</tt> + maps directly to <tt>synchronize_sched()</tt>. + Therefore, <tt>synchronize_rcu()</tt> works normally throughout + boot in <tt>CONFIG_PREEMPT=n</tt> kernels. + However, your code must also work in <tt>CONFIG_PREEMPT=y</tt> kernels, + so it is still necessary to avoid invoking <tt>synchronize_rcu()</tt> + during scheduler initialization. +</font></td></tr> +<tr><td> </td></tr> +</table> <p> I learned of these boot-time requirements as a result of a series of @@ -2171,6 +2525,14 @@ This real-time requirement motivated the grace-period kthread, which also simplified handling of a number of race conditions. <p> +RCU must avoid degrading real-time response for CPU-bound threads, whether +executing in usermode (which is one use case for +<tt>CONFIG_NO_HZ_FULL=y</tt>) or in the kernel. +That said, CPU-bound loops in the kernel must execute +<tt>cond_resched_rcu_qs()</tt> at least once per few tens of milliseconds +in order to avoid receiving an IPI from RCU. + +<p> Finally, RCU's status as a synchronization primitive means that any RCU failure can result in arbitrary memory corruption that can be extremely difficult to debug. @@ -2223,6 +2585,8 @@ described in a separate section. 
<li> <a href="#Sched Flavor">Sched Flavor</a> <li> <a href="#Sleepable RCU">Sleepable RCU</a> <li> <a href="#Tasks RCU">Tasks RCU</a> +<li> <a href="#Waiting for Multiple Grace Periods"> + Waiting for Multiple Grace Periods</a> </ol> <h3><a name="Bottom-Half Flavor">Bottom-Half Flavor</a></h3> @@ -2472,6 +2836,94 @@ The tasks-RCU API is quite compact, consisting only of <tt>synchronize_rcu_tasks()</tt>, and <tt>rcu_barrier_tasks()</tt>. +<h3><a name="Waiting for Multiple Grace Periods"> +Waiting for Multiple Grace Periods</a></h3> + +<p> +Perhaps you have an RCU protected data structure that is accessed from +RCU read-side critical sections, from softirq handlers, and from +hardware interrupt handlers. +That is three flavors of RCU, the normal flavor, the bottom-half flavor, +and the sched flavor. +How to wait for a compound grace period? + +<p> +The best approach is usually to “just say no!” and +insert <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt> +around each RCU read-side critical section, regardless of what +environment it happens to be in. +But suppose that some of the RCU read-side critical sections are +on extremely hot code paths, and that use of <tt>CONFIG_PREEMPT=n</tt> +is not a viable option, so that <tt>rcu_read_lock()</tt> and +<tt>rcu_read_unlock()</tt> are not free. +What then? + +<p> +You <i>could</i> wait on all three grace periods in succession, as follows: + +<blockquote> +<pre> + 1 synchronize_rcu(); + 2 synchronize_rcu_bh(); + 3 synchronize_sched(); +</pre> +</blockquote> + +<p> +This works, but triples the update-side latency penalty. +In cases where this is not acceptable, <tt>synchronize_rcu_mult()</tt> +may be used to wait on all three flavors of grace period concurrently: + +<blockquote> +<pre> + 1 synchronize_rcu_mult(call_rcu, call_rcu_bh, call_rcu_sched); +</pre> +</blockquote> + +<p> +But what if it is necessary to also wait on SRCU? +This can be done as follows: + +<blockquote> +<pre> + 1 static void call_my_srcu(struct rcu_head *head, + 2 void (*func)(struct rcu_head *head)) + 3 { + 4 call_srcu(&my_srcu, head, func); + 5 } + 6 + 7 synchronize_rcu_mult(call_rcu, call_rcu_bh, call_rcu_sched, call_my_srcu); +</pre> +</blockquote> + +<p> +If you needed to wait on multiple different flavors of SRCU +(but why???), you would need to create a wrapper function resembling +<tt>call_my_srcu()</tt> for each SRCU flavor. + +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + But what if I need to wait for multiple RCU flavors, but I also need + the grace periods to be expedited? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + If you are using expedited grace periods, there should be less penalty + for waiting on them in succession. + But if that is nevertheless a problem, you can use workqueues + or multiple kthreads to wait on the various expedited grace + periods concurrently. +</font></td></tr> +<tr><td> </td></tr> +</table> + +<p> +Again, it is usually better to adjust the RCU read-side critical sections +to use a single flavor of RCU, but when this is not feasible, you can use +<tt>synchronize_rcu_mult()</tt>. + <h2><a name="Possible Future Changes">Possible Future Changes</a></h2> <p> @@ -2569,329 +3021,4 @@ and is provided under the terms of the Creative Commons Attribution-Share Alike 3.0 United States license. -<h3><a name="Answers to Quick Quizzes"> -Answers to Quick Quizzes</a></h3> - -<a name="qq1answer"></a> -<p><b>Quick Quiz 1</b>: -Wait a minute! 
-You said that updaters can make useful forward progress concurrently -with readers, but pre-existing readers will block -<tt>synchronize_rcu()</tt>!!! -Just who are you trying to fool??? - - -</p><p><b>Answer</b>: -First, if updaters do not wish to be blocked by readers, they can use -<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt>, which will -be discussed later. -Second, even when using <tt>synchronize_rcu()</tt>, the other -update-side code does run concurrently with readers, whether pre-existing -or not. - - -</p><p><a href="#Quick%20Quiz%201"><b>Back to Quick Quiz 1</b>.</a> - -<a name="qq2answer"></a> -<p><b>Quick Quiz 2</b>: -Why is the <tt>synchronize_rcu()</tt> on line 28 needed? - - -</p><p><b>Answer</b>: -Without that extra grace period, memory reordering could result in -<tt>do_something_dlm()</tt> executing <tt>do_something()</tt> -concurrently with the last bits of <tt>recovery()</tt>. - - -</p><p><a href="#Quick%20Quiz%202"><b>Back to Quick Quiz 2</b>.</a> - -<a name="qq3answer"></a> -<p><b>Quick Quiz 3</b>: -But <tt>rcu_assign_pointer()</tt> does nothing to prevent the -two assignments to <tt>p->a</tt> and <tt>p->b</tt> -from being reordered. -Can't that also cause problems? - - -</p><p><b>Answer</b>: -No, it cannot. -The readers cannot see either of these two fields until -the assignment to <tt>gp</tt>, by which time both fields are -fully initialized. -So reordering the assignments -to <tt>p->a</tt> and <tt>p->b</tt> cannot possibly -cause any problems. - - -</p><p><a href="#Quick%20Quiz%203"><b>Back to Quick Quiz 3</b>.</a> - -<a name="qq4answer"></a> -<p><b>Quick Quiz 4</b>: -Without the <tt>rcu_dereference()</tt> or the -<tt>rcu_access_pointer()</tt>, what destructive optimizations -might the compiler make use of? - - -</p><p><b>Answer</b>: -Let's start with what happens to <tt>do_something_gp()</tt> -if it fails to use <tt>rcu_dereference()</tt>. -It could reuse a value formerly fetched from this same pointer. -It could also fetch the pointer from <tt>gp</tt> in a byte-at-a-time -manner, resulting in <i>load tearing</i>, in turn resulting a bytewise -mash-up of two distince pointer values. -It might even use value-speculation optimizations, where it makes a wrong -guess, but by the time it gets around to checking the value, an update -has changed the pointer to match the wrong guess. -Too bad about any dereferences that returned pre-initialization garbage -in the meantime! - -<p> -For <tt>remove_gp_synchronous()</tt>, as long as all modifications -to <tt>gp</tt> are carried out while holding <tt>gp_lock</tt>, -the above optimizations are harmless. -However, -with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt>, -<tt>sparse</tt> will complain if you -define <tt>gp</tt> with <tt>__rcu</tt> and then -access it without using -either <tt>rcu_access_pointer()</tt> or <tt>rcu_dereference()</tt>. - - -</p><p><a href="#Quick%20Quiz%204"><b>Back to Quick Quiz 4</b>.</a> - -<a name="qq5answer"></a> -<p><b>Quick Quiz 5</b>: -Given that multiple CPUs can start RCU read-side critical sections -at any time without any ordering whatsoever, how can RCU possibly tell whether -or not a given RCU read-side critical section starts before a -given instance of <tt>synchronize_rcu()</tt>? - - -</p><p><b>Answer</b>: -If RCU cannot tell whether or not a given -RCU read-side critical section starts before a -given instance of <tt>synchronize_rcu()</tt>, -then it must assume that the RCU read-side critical section -started first. 
-In other words, a given instance of <tt>synchronize_rcu()</tt> -can avoid waiting on a given RCU read-side critical section only -if it can prove that <tt>synchronize_rcu()</tt> started first. - - -</p><p><a href="#Quick%20Quiz%205"><b>Back to Quick Quiz 5</b>.</a> - -<a name="qq6answer"></a> -<p><b>Quick Quiz 6</b>: -The first and second guarantees require unbelievably strict ordering! -Are all these memory barriers <i> really</i> required? - - -</p><p><b>Answer</b>: -Yes, they really are required. -To see why the first guarantee is required, consider the following -sequence of events: - -<ol> -<li> CPU 1: <tt>rcu_read_lock()</tt> -<li> CPU 1: <tt>q = rcu_dereference(gp); - /* Very likely to return p. */</tt> -<li> CPU 0: <tt>list_del_rcu(p);</tt> -<li> CPU 0: <tt>synchronize_rcu()</tt> starts. -<li> CPU 1: <tt>do_something_with(q->a); - /* No smp_mb(), so might happen after kfree(). */</tt> -<li> CPU 1: <tt>rcu_read_unlock()</tt> -<li> CPU 0: <tt>synchronize_rcu()</tt> returns. -<li> CPU 0: <tt>kfree(p);</tt> -</ol> - -<p> -Therefore, there absolutely must be a full memory barrier between the -end of the RCU read-side critical section and the end of the -grace period. - -<p> -The sequence of events demonstrating the necessity of the second rule -is roughly similar: - -<ol> -<li> CPU 0: <tt>list_del_rcu(p);</tt> -<li> CPU 0: <tt>synchronize_rcu()</tt> starts. -<li> CPU 1: <tt>rcu_read_lock()</tt> -<li> CPU 1: <tt>q = rcu_dereference(gp); - /* Might return p if no memory barrier. */</tt> -<li> CPU 0: <tt>synchronize_rcu()</tt> returns. -<li> CPU 0: <tt>kfree(p);</tt> -<li> CPU 1: <tt>do_something_with(q->a); /* Boom!!! */</tt> -<li> CPU 1: <tt>rcu_read_unlock()</tt> -</ol> - -<p> -And similarly, without a memory barrier between the beginning of the -grace period and the beginning of the RCU read-side critical section, -CPU 1 might end up accessing the freelist. - -<p> -The “as if” rule of course applies, so that any implementation -that acts as if the appropriate memory barriers were in place is a -correct implementation. -That said, it is much easier to fool yourself into believing that you have -adhered to the as-if rule than it is to actually adhere to it! - - -</p><p><a href="#Quick%20Quiz%206"><b>Back to Quick Quiz 6</b>.</a> - -<a name="qq7answer"></a> -<p><b>Quick Quiz 7</b>: -But how does the upgrade-to-write operation exclude other readers? - - -</p><p><b>Answer</b>: -It doesn't, just like normal RCU updates, which also do not exclude -RCU readers. - - -</p><p><a href="#Quick%20Quiz%207"><b>Back to Quick Quiz 7</b>.</a> - -<a name="qq8answer"></a> -<p><b>Quick Quiz 8</b>: -Can't the compiler also reorder this code? - - -</p><p><b>Answer</b>: -No, the volatile casts in <tt>READ_ONCE()</tt> and -<tt>WRITE_ONCE()</tt> prevent the compiler from reordering in -this particular case. - - -</p><p><a href="#Quick%20Quiz%208"><b>Back to Quick Quiz 8</b>.</a> - -<a name="qq9answer"></a> -<p><b>Quick Quiz 9</b>: -Suppose that synchronize_rcu() did wait until all readers had completed. -Would the updater be able to rely on this? - - -</p><p><b>Answer</b>: -No. -Even if <tt>synchronize_rcu()</tt> were to wait until -all readers had completed, a new reader might start immediately after -<tt>synchronize_rcu()</tt> completed. -Therefore, the code following -<tt>synchronize_rcu()</tt> cannot rely on there being no readers -in any case. 
- - -</p><p><a href="#Quick%20Quiz%209"><b>Back to Quick Quiz 9</b>.</a> - -<a name="qq10answer"></a> -<p><b>Quick Quiz 10</b>: -How long a sequence of grace periods, each separated by an RCU read-side -critical section, would be required to partition the RCU read-side -critical sections at the beginning and end of the chain? - - -</p><p><b>Answer</b>: -In theory, an infinite number. -In practice, an unknown number that is sensitive to both implementation -details and timing considerations. -Therefore, even in practice, RCU users must abide by the theoretical rather -than the practical answer. - - -</p><p><a href="#Quick%20Quiz%2010"><b>Back to Quick Quiz 10</b>.</a> - -<a name="qq11answer"></a> -<p><b>Quick Quiz 11</b>: -What about sleeping locks? - - -</p><p><b>Answer</b>: -These are forbidden within Linux-kernel RCU read-side critical sections -because it is not legal to place a quiescent state (in this case, -voluntary context switch) within an RCU read-side critical section. -However, sleeping locks may be used within userspace RCU read-side critical -sections, and also within Linux-kernel sleepable RCU -<a href="#Sleepable RCU">(SRCU)</a> -read-side critical sections. -In addition, the -rt patchset turns spinlocks into a sleeping locks so -that the corresponding critical sections can be preempted, which -also means that these sleeplockified spinlocks (but not other sleeping locks!) -may be acquire within -rt-Linux-kernel RCU read-side critical sections. - -<p> -Note that it <i>is</i> legal for a normal RCU read-side critical section -to conditionally acquire a sleeping locks (as in <tt>mutex_trylock()</tt>), -but only as long as it does not loop indefinitely attempting to -conditionally acquire that sleeping locks. -The key point is that things like <tt>mutex_trylock()</tt> -either return with the mutex held, or return an error indication if -the mutex was not immediately available. -Either way, <tt>mutex_trylock()</tt> returns immediately without sleeping. - - -</p><p><a href="#Quick%20Quiz%2011"><b>Back to Quick Quiz 11</b>.</a> - -<a name="qq12answer"></a> -<p><b>Quick Quiz 12</b>: -Why does line 19 use <tt>rcu_access_pointer()</tt>? -After all, <tt>call_rcu()</tt> on line 25 stores into the -structure, which would interact badly with concurrent insertions. -Doesn't this mean that <tt>rcu_dereference()</tt> is required? - - -</p><p><b>Answer</b>: -Presumably the <tt>->gp_lock</tt> acquired on line 18 excludes -any changes, including any insertions that <tt>rcu_dereference()</tt> -would protect against. -Therefore, any insertions will be delayed until after <tt>->gp_lock</tt> -is released on line 25, which in turn means that -<tt>rcu_access_pointer()</tt> suffices. - - -</p><p><a href="#Quick%20Quiz%2012"><b>Back to Quick Quiz 12</b>.</a> - -<a name="qq13answer"></a> -<p><b>Quick Quiz 13</b>: -Earlier it was claimed that <tt>call_rcu()</tt> and -<tt>kfree_rcu()</tt> allowed updaters to avoid being blocked -by readers. -But how can that be correct, given that the invocation of the callback -and the freeing of the memory (respectively) must still wait for -a grace period to elapse? - - -</p><p><b>Answer</b>: -We could define things this way, but keep in mind that this sort of -definition would say that updates in garbage-collected languages -cannot complete until the next time the garbage collector runs, -which does not seem at all reasonable. 
-The key point is that in most cases, an updater using either -<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> can proceed to the -next update as soon as it has invoked <tt>call_rcu()</tt> or -<tt>kfree_rcu()</tt>, without having to wait for a subsequent -grace period. - - -</p><p><a href="#Quick%20Quiz%2013"><b>Back to Quick Quiz 13</b>.</a> - -<a name="qq14answer"></a> -<p><b>Quick Quiz 14</b>: -So what happens with <tt>synchronize_rcu()</tt> during -scheduler initialization for <tt>CONFIG_PREEMPT=n</tt> -kernels? - - -</p><p><b>Answer</b>: -In <tt>CONFIG_PREEMPT=n</tt> kernel, <tt>synchronize_rcu()</tt> -maps directly to <tt>synchronize_sched()</tt>. -Therefore, <tt>synchronize_rcu()</tt> works normally throughout -boot in <tt>CONFIG_PREEMPT=n</tt> kernels. -However, your code must also work in <tt>CONFIG_PREEMPT=y</tt> kernels, -so it is still necessary to avoid invoking <tt>synchronize_rcu()</tt> -during scheduler initialization. - - -</p><p><a href="#Quick%20Quiz%2014"><b>Back to Quick Quiz 14</b>.</a> - - </body></html> diff --git a/Documentation/RCU/Design/Requirements/Requirements.htmlx b/Documentation/RCU/Design/Requirements/Requirements.htmlx deleted file mode 100644 index 3a97ba490c42..000000000000 --- a/Documentation/RCU/Design/Requirements/Requirements.htmlx +++ /dev/null @@ -1,2741 +0,0 @@ -<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" - "http://www.w3.org/TR/html4/loose.dtd"> - <html> - <head><title>A Tour Through RCU's Requirements [LWN.net]</title> - <meta HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=utf-8"> - -<h1>A Tour Through RCU's Requirements</h1> - -<p>Copyright IBM Corporation, 2015</p> -<p>Author: Paul E. McKenney</p> -<p><i>The initial version of this document appeared in the -<a href="https://lwn.net/">LWN</a> articles -<a href="https://lwn.net/Articles/652156/">here</a>, -<a href="https://lwn.net/Articles/652677/">here</a>, and -<a href="https://lwn.net/Articles/653326/">here</a>.</i></p> - -<h2>Introduction</h2> - -<p> -Read-copy update (RCU) is a synchronization mechanism that is often -used as a replacement for reader-writer locking. -RCU is unusual in that updaters do not block readers, -which means that RCU's read-side primitives can be exceedingly fast -and scalable. -In addition, updaters can make useful forward progress concurrently -with readers. -However, all this concurrency between RCU readers and updaters does raise -the question of exactly what RCU readers are doing, which in turn -raises the question of exactly what RCU's requirements are. - -<p> -This document therefore summarizes RCU's requirements, and can be thought -of as an informal, high-level specification for RCU. -It is important to understand that RCU's specification is primarily -empirical in nature; -in fact, I learned about many of these requirements the hard way. -This situation might cause some consternation, however, not only -has this learning process been a lot of fun, but it has also been -a great privilege to work with so many people willing to apply -technologies in interesting new ways. 
- -<p> -All that aside, here are the categories of currently known RCU requirements: -</p> - -<ol> -<li> <a href="#Fundamental Requirements"> - Fundamental Requirements</a> -<li> <a href="#Fundamental Non-Requirements">Fundamental Non-Requirements</a> -<li> <a href="#Parallelism Facts of Life"> - Parallelism Facts of Life</a> -<li> <a href="#Quality-of-Implementation Requirements"> - Quality-of-Implementation Requirements</a> -<li> <a href="#Linux Kernel Complications"> - Linux Kernel Complications</a> -<li> <a href="#Software-Engineering Requirements"> - Software-Engineering Requirements</a> -<li> <a href="#Other RCU Flavors"> - Other RCU Flavors</a> -<li> <a href="#Possible Future Changes"> - Possible Future Changes</a> -</ol> - -<p> -This is followed by a <a href="#Summary">summary</a>, -which is in turn followed by the inevitable -<a href="#Answers to Quick Quizzes">answers to the quick quizzes</a>. - -<h2><a name="Fundamental Requirements">Fundamental Requirements</a></h2> - -<p> -RCU's fundamental requirements are the closest thing RCU has to hard -mathematical requirements. -These are: - -<ol> -<li> <a href="#Grace-Period Guarantee"> - Grace-Period Guarantee</a> -<li> <a href="#Publish-Subscribe Guarantee"> - Publish-Subscribe Guarantee</a> -<li> <a href="#Memory-Barrier Guarantees"> - Memory-Barrier Guarantees</a> -<li> <a href="#RCU Primitives Guaranteed to Execute Unconditionally"> - RCU Primitives Guaranteed to Execute Unconditionally</a> -<li> <a href="#Guaranteed Read-to-Write Upgrade"> - Guaranteed Read-to-Write Upgrade</a> -</ol> - -<h3><a name="Grace-Period Guarantee">Grace-Period Guarantee</a></h3> - -<p> -RCU's grace-period guarantee is unusual in being premeditated: -Jack Slingwine and I had this guarantee firmly in mind when we started -work on RCU (then called “rclock”) in the early 1990s. -That said, the past two decades of experience with RCU have produced -a much more detailed understanding of this guarantee. - -<p> -RCU's grace-period guarantee allows updaters to wait for the completion -of all pre-existing RCU read-side critical sections. -An RCU read-side critical section -begins with the marker <tt>rcu_read_lock()</tt> and ends with -the marker <tt>rcu_read_unlock()</tt>. -These markers may be nested, and RCU treats a nested set as one -big RCU read-side critical section. -Production-quality implementations of <tt>rcu_read_lock()</tt> and -<tt>rcu_read_unlock()</tt> are extremely lightweight, and in -fact have exactly zero overhead in Linux kernels built for production -use with <tt>CONFIG_PREEMPT=n</tt>. - -<p> -This guarantee allows ordering to be enforced with extremely low -overhead to readers, for example: - -<blockquote> -<pre> - 1 int x, y; - 2 - 3 void thread0(void) - 4 { - 5 rcu_read_lock(); - 6 r1 = READ_ONCE(x); - 7 r2 = READ_ONCE(y); - 8 rcu_read_unlock(); - 9 } -10 -11 void thread1(void) -12 { -13 WRITE_ONCE(x, 1); -14 synchronize_rcu(); -15 WRITE_ONCE(y, 1); -16 } -</pre> -</blockquote> - -<p> -Because the <tt>synchronize_rcu()</tt> on line 14 waits for -all pre-existing readers, any instance of <tt>thread0()</tt> that -loads a value of zero from <tt>x</tt> must complete before -<tt>thread1()</tt> stores to <tt>y</tt>, so that instance must -also load a value of zero from <tt>y</tt>. -Similarly, any instance of <tt>thread0()</tt> that loads a value of -one from <tt>y</tt> must have started after the -<tt>synchronize_rcu()</tt> started, and must therefore also load -a value of one from <tt>x</tt>. 
-Therefore, the outcome: -<blockquote> -<pre> -(r1 == 0 && r2 == 1) -</pre> -</blockquote> -cannot happen. - -<p>@@QQ@@ -Wait a minute! -You said that updaters can make useful forward progress concurrently -with readers, but pre-existing readers will block -<tt>synchronize_rcu()</tt>!!! -Just who are you trying to fool??? -<p>@@QQA@@ -First, if updaters do not wish to be blocked by readers, they can use -<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt>, which will -be discussed later. -Second, even when using <tt>synchronize_rcu()</tt>, the other -update-side code does run concurrently with readers, whether pre-existing -or not. -<p>@@QQE@@ - -<p> -This scenario resembles one of the first uses of RCU in -<a href="https://en.wikipedia.org/wiki/DYNIX">DYNIX/ptx</a>, -which managed a distributed lock manager's transition into -a state suitable for handling recovery from node failure, -more or less as follows: - -<blockquote> -<pre> - 1 #define STATE_NORMAL 0 - 2 #define STATE_WANT_RECOVERY 1 - 3 #define STATE_RECOVERING 2 - 4 #define STATE_WANT_NORMAL 3 - 5 - 6 int state = STATE_NORMAL; - 7 - 8 void do_something_dlm(void) - 9 { -10 int state_snap; -11 -12 rcu_read_lock(); -13 state_snap = READ_ONCE(state); -14 if (state_snap == STATE_NORMAL) -15 do_something(); -16 else -17 do_something_carefully(); -18 rcu_read_unlock(); -19 } -20 -21 void start_recovery(void) -22 { -23 WRITE_ONCE(state, STATE_WANT_RECOVERY); -24 synchronize_rcu(); -25 WRITE_ONCE(state, STATE_RECOVERING); -26 recovery(); -27 WRITE_ONCE(state, STATE_WANT_NORMAL); -28 synchronize_rcu(); -29 WRITE_ONCE(state, STATE_NORMAL); -30 } -</pre> -</blockquote> - -<p> -The RCU read-side critical section in <tt>do_something_dlm()</tt> -works with the <tt>synchronize_rcu()</tt> in <tt>start_recovery()</tt> -to guarantee that <tt>do_something()</tt> never runs concurrently -with <tt>recovery()</tt>, but with little or no synchronization -overhead in <tt>do_something_dlm()</tt>. - -<p>@@QQ@@ -Why is the <tt>synchronize_rcu()</tt> on line 28 needed? -<p>@@QQA@@ -Without that extra grace period, memory reordering could result in -<tt>do_something_dlm()</tt> executing <tt>do_something()</tt> -concurrently with the last bits of <tt>recovery()</tt>. -<p>@@QQE@@ - -<p> -In order to avoid fatal problems such as deadlocks, -an RCU read-side critical section must not contain calls to -<tt>synchronize_rcu()</tt>. -Similarly, an RCU read-side critical section must not -contain anything that waits, directly or indirectly, on completion of -an invocation of <tt>synchronize_rcu()</tt>. - -<p> -Although RCU's grace-period guarantee is useful in and of itself, with -<a href="https://lwn.net/Articles/573497/">quite a few use cases</a>, -it would be good to be able to use RCU to coordinate read-side -access to linked data structures. -For this, the grace-period guarantee is not sufficient, as can -be seen in function <tt>add_gp_buggy()</tt> below. -We will look at the reader's code later, but in the meantime, just think of -the reader as locklessly picking up the <tt>gp</tt> pointer, -and, if the value loaded is non-<tt>NULL</tt>, locklessly accessing the -<tt>->a</tt> and <tt>->b</tt> fields. 
- -<blockquote> -<pre> - 1 bool add_gp_buggy(int a, int b) - 2 { - 3 p = kmalloc(sizeof(*p), GFP_KERNEL); - 4 if (!p) - 5 return -ENOMEM; - 6 spin_lock(&gp_lock); - 7 if (rcu_access_pointer(gp)) { - 8 spin_unlock(&gp_lock); - 9 return false; -10 } -11 p->a = a; -12 p->b = a; -13 gp = p; /* ORDERING BUG */ -14 spin_unlock(&gp_lock); -15 return true; -16 } -</pre> -</blockquote> - -<p> -The problem is that both the compiler and weakly ordered CPUs are within -their rights to reorder this code as follows: - -<blockquote> -<pre> - 1 bool add_gp_buggy_optimized(int a, int b) - 2 { - 3 p = kmalloc(sizeof(*p), GFP_KERNEL); - 4 if (!p) - 5 return -ENOMEM; - 6 spin_lock(&gp_lock); - 7 if (rcu_access_pointer(gp)) { - 8 spin_unlock(&gp_lock); - 9 return false; -10 } -<b>11 gp = p; /* ORDERING BUG */ -12 p->a = a; -13 p->b = a;</b> -14 spin_unlock(&gp_lock); -15 return true; -16 } -</pre> -</blockquote> - -<p> -If an RCU reader fetches <tt>gp</tt> just after -<tt>add_gp_buggy_optimized</tt> executes line 11, -it will see garbage in the <tt>->a</tt> and <tt>->b</tt> -fields. -And this is but one of many ways in which compiler and hardware optimizations -could cause trouble. -Therefore, we clearly need some way to prevent the compiler and the CPU from -reordering in this manner, which brings us to the publish-subscribe -guarantee discussed in the next section. - -<h3><a name="Publish-Subscribe Guarantee">Publish/Subscribe Guarantee</a></h3> - -<p> -RCU's publish-subscribe guarantee allows data to be inserted -into a linked data structure without disrupting RCU readers. -The updater uses <tt>rcu_assign_pointer()</tt> to insert the -new data, and readers use <tt>rcu_dereference()</tt> to -access data, whether new or old. -The following shows an example of insertion: - -<blockquote> -<pre> - 1 bool add_gp(int a, int b) - 2 { - 3 p = kmalloc(sizeof(*p), GFP_KERNEL); - 4 if (!p) - 5 return -ENOMEM; - 6 spin_lock(&gp_lock); - 7 if (rcu_access_pointer(gp)) { - 8 spin_unlock(&gp_lock); - 9 return false; -10 } -11 p->a = a; -12 p->b = a; -13 rcu_assign_pointer(gp, p); -14 spin_unlock(&gp_lock); -15 return true; -16 } -</pre> -</blockquote> - -<p> -The <tt>rcu_assign_pointer()</tt> on line 13 is conceptually -equivalent to a simple assignment statement, but also guarantees -that its assignment will -happen after the two assignments in lines 11 and 12, -similar to the C11 <tt>memory_order_release</tt> store operation. -It also prevents any number of “interesting” compiler -optimizations, for example, the use of <tt>gp</tt> as a scratch -location immediately preceding the assignment. - -<p>@@QQ@@ -But <tt>rcu_assign_pointer()</tt> does nothing to prevent the -two assignments to <tt>p->a</tt> and <tt>p->b</tt> -from being reordered. -Can't that also cause problems? -<p>@@QQA@@ -No, it cannot. -The readers cannot see either of these two fields until -the assignment to <tt>gp</tt>, by which time both fields are -fully initialized. -So reordering the assignments -to <tt>p->a</tt> and <tt>p->b</tt> cannot possibly -cause any problems. -<p>@@QQE@@ - -<p> -It is tempting to assume that the reader need not do anything special -to control its accesses to the RCU-protected data, -as shown in <tt>do_something_gp_buggy()</tt> below: - -<blockquote> -<pre> - 1 bool do_something_gp_buggy(void) - 2 { - 3 rcu_read_lock(); - 4 p = gp; /* OPTIMIZATIONS GALORE!!! 
*/ - 5 if (p) { - 6 do_something(p->a, p->b); - 7 rcu_read_unlock(); - 8 return true; - 9 } -10 rcu_read_unlock(); -11 return false; -12 } -</pre> -</blockquote> - -<p> -However, this temptation must be resisted because there are a -surprisingly large number of ways that the compiler -(to say nothing of -<a href="https://h71000.www7.hp.com/wizard/wiz_2637.html">DEC Alpha CPUs</a>) -can trip this code up. -For but one example, if the compiler were short of registers, it -might choose to refetch from <tt>gp</tt> rather than keeping -a separate copy in <tt>p</tt> as follows: - -<blockquote> -<pre> - 1 bool do_something_gp_buggy_optimized(void) - 2 { - 3 rcu_read_lock(); - 4 if (gp) { /* OPTIMIZATIONS GALORE!!! */ -<b> 5 do_something(gp->a, gp->b);</b> - 6 rcu_read_unlock(); - 7 return true; - 8 } - 9 rcu_read_unlock(); -10 return false; -11 } -</pre> -</blockquote> - -<p> -If this function ran concurrently with a series of updates that -replaced the current structure with a new one, -the fetches of <tt>gp->a</tt> -and <tt>gp->b</tt> might well come from two different structures, -which could cause serious confusion. -To prevent this (and much else besides), <tt>do_something_gp()</tt> uses -<tt>rcu_dereference()</tt> to fetch from <tt>gp</tt>: - -<blockquote> -<pre> - 1 bool do_something_gp(void) - 2 { - 3 rcu_read_lock(); - 4 p = rcu_dereference(gp); - 5 if (p) { - 6 do_something(p->a, p->b); - 7 rcu_read_unlock(); - 8 return true; - 9 } -10 rcu_read_unlock(); -11 return false; -12 } -</pre> -</blockquote> - -<p> -The <tt>rcu_dereference()</tt> uses volatile casts and (for DEC Alpha) -memory barriers in the Linux kernel. -Should a -<a href="http://www.rdrop.com/users/paulmck/RCU/consume.2015.07.13a.pdf">high-quality implementation of C11 <tt>memory_order_consume</tt> [PDF]</a> -ever appear, then <tt>rcu_dereference()</tt> could be implemented -as a <tt>memory_order_consume</tt> load. -Regardless of the exact implementation, a pointer fetched by -<tt>rcu_dereference()</tt> may not be used outside of the -outermost RCU read-side critical section containing that -<tt>rcu_dereference()</tt>, unless protection of -the corresponding data element has been passed from RCU to some -other synchronization mechanism, most commonly locking or -<a href="https://www.kernel.org/doc/Documentation/RCU/rcuref.txt">reference counting</a>. - -<p> -In short, updaters use <tt>rcu_assign_pointer()</tt> and readers -use <tt>rcu_dereference()</tt>, and these two RCU API elements -work together to ensure that readers have a consistent view of -newly added data elements. - -<p> -Of course, it is also necessary to remove elements from RCU-protected -data structures, for example, using the following process: - -<ol> -<li> Remove the data element from the enclosing structure. -<li> Wait for all pre-existing RCU read-side critical sections - to complete (because only pre-existing readers can possibly have - a reference to the newly removed data element). -<li> At this point, only the updater has a reference to the - newly removed data element, so it can safely reclaim - the data element, for example, by passing it to <tt>kfree()</tt>. 
-</ol> - -This process is implemented by <tt>remove_gp_synchronous()</tt>: - -<blockquote> -<pre> - 1 bool remove_gp_synchronous(void) - 2 { - 3 struct foo *p; - 4 - 5 spin_lock(&gp_lock); - 6 p = rcu_access_pointer(gp); - 7 if (!p) { - 8 spin_unlock(&gp_lock); - 9 return false; -10 } -11 rcu_assign_pointer(gp, NULL); -12 spin_unlock(&gp_lock); -13 synchronize_rcu(); -14 kfree(p); -15 return true; -16 } -</pre> -</blockquote> - -<p> -This function is straightforward, with line 13 waiting for a grace -period before line 14 frees the old data element. -This waiting ensures that readers will reach line 7 of -<tt>do_something_gp()</tt> before the data element referenced by -<tt>p</tt> is freed. -The <tt>rcu_access_pointer()</tt> on line 6 is similar to -<tt>rcu_dereference()</tt>, except that: - -<ol> -<li> The value returned by <tt>rcu_access_pointer()</tt> - cannot be dereferenced. - If you want to access the value pointed to as well as - the pointer itself, use <tt>rcu_dereference()</tt> - instead of <tt>rcu_access_pointer()</tt>. -<li> The call to <tt>rcu_access_pointer()</tt> need not be - protected. - In contrast, <tt>rcu_dereference()</tt> must either be - within an RCU read-side critical section or in a code - segment where the pointer cannot change, for example, in - code protected by the corresponding update-side lock. -</ol> - -<p>@@QQ@@ -Without the <tt>rcu_dereference()</tt> or the -<tt>rcu_access_pointer()</tt>, what destructive optimizations -might the compiler make use of? -<p>@@QQA@@ -Let's start with what happens to <tt>do_something_gp()</tt> -if it fails to use <tt>rcu_dereference()</tt>. -It could reuse a value formerly fetched from this same pointer. -It could also fetch the pointer from <tt>gp</tt> in a byte-at-a-time -manner, resulting in <i>load tearing</i>, in turn resulting a bytewise -mash-up of two distince pointer values. -It might even use value-speculation optimizations, where it makes a wrong -guess, but by the time it gets around to checking the value, an update -has changed the pointer to match the wrong guess. -Too bad about any dereferences that returned pre-initialization garbage -in the meantime! - -<p> -For <tt>remove_gp_synchronous()</tt>, as long as all modifications -to <tt>gp</tt> are carried out while holding <tt>gp_lock</tt>, -the above optimizations are harmless. -However, -with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt>, -<tt>sparse</tt> will complain if you -define <tt>gp</tt> with <tt>__rcu</tt> and then -access it without using -either <tt>rcu_access_pointer()</tt> or <tt>rcu_dereference()</tt>. -<p>@@QQE@@ - -<p> -In short, RCU's publish-subscribe guarantee is provided by the combination -of <tt>rcu_assign_pointer()</tt> and <tt>rcu_dereference()</tt>. -This guarantee allows data elements to be safely added to RCU-protected -linked data structures without disrupting RCU readers. -This guarantee can be used in combination with the grace-period -guarantee to also allow data elements to be removed from RCU-protected -linked data structures, again without disrupting RCU readers. - -<p> -This guarantee was only partially premeditated. -DYNIX/ptx used an explicit memory barrier for publication, but had nothing -resembling <tt>rcu_dereference()</tt> for subscription, nor did it -have anything resembling the <tt>smp_read_barrier_depends()</tt> -that was later subsumed into <tt>rcu_dereference()</tt>. 
-The need for these operations made itself known quite suddenly at a -late-1990s meeting with the DEC Alpha architects, back in the days when -DEC was still a free-standing company. -It took the Alpha architects a good hour to convince me that any sort -of barrier would ever be needed, and it then took me a good <i>two</i> hours -to convince them that their documentation did not make this point clear. -More recent work with the C and C++ standards committees have provided -much education on tricks and traps from the compiler. -In short, compilers were much less tricky in the early 1990s, but in -2015, don't even think about omitting <tt>rcu_dereference()</tt>! - -<h3><a name="Memory-Barrier Guarantees">Memory-Barrier Guarantees</a></h3> - -<p> -The previous section's simple linked-data-structure scenario clearly -demonstrates the need for RCU's stringent memory-ordering guarantees on -systems with more than one CPU: - -<ol> -<li> Each CPU that has an RCU read-side critical section that - begins before <tt>synchronize_rcu()</tt> starts is - guaranteed to execute a full memory barrier between the time - that the RCU read-side critical section ends and the time that - <tt>synchronize_rcu()</tt> returns. - Without this guarantee, a pre-existing RCU read-side critical section - might hold a reference to the newly removed <tt>struct foo</tt> - after the <tt>kfree()</tt> on line 14 of - <tt>remove_gp_synchronous()</tt>. -<li> Each CPU that has an RCU read-side critical section that ends - after <tt>synchronize_rcu()</tt> returns is guaranteed - to execute a full memory barrier between the time that - <tt>synchronize_rcu()</tt> begins and the time that the RCU - read-side critical section begins. - Without this guarantee, a later RCU read-side critical section - running after the <tt>kfree()</tt> on line 14 of - <tt>remove_gp_synchronous()</tt> might - later run <tt>do_something_gp()</tt> and find the - newly deleted <tt>struct foo</tt>. -<li> If the task invoking <tt>synchronize_rcu()</tt> remains - on a given CPU, then that CPU is guaranteed to execute a full - memory barrier sometime during the execution of - <tt>synchronize_rcu()</tt>. - This guarantee ensures that the <tt>kfree()</tt> on - line 14 of <tt>remove_gp_synchronous()</tt> really does - execute after the removal on line 11. -<li> If the task invoking <tt>synchronize_rcu()</tt> migrates - among a group of CPUs during that invocation, then each of the - CPUs in that group is guaranteed to execute a full memory barrier - sometime during the execution of <tt>synchronize_rcu()</tt>. - This guarantee also ensures that the <tt>kfree()</tt> on - line 14 of <tt>remove_gp_synchronous()</tt> really does - execute after the removal on - line 11, but also in the case where the thread executing the - <tt>synchronize_rcu()</tt> migrates in the meantime. -</ol> - -<p>@@QQ@@ -Given that multiple CPUs can start RCU read-side critical sections -at any time without any ordering whatsoever, how can RCU possibly tell whether -or not a given RCU read-side critical section starts before a -given instance of <tt>synchronize_rcu()</tt>? -<p>@@QQA@@ -If RCU cannot tell whether or not a given -RCU read-side critical section starts before a -given instance of <tt>synchronize_rcu()</tt>, -then it must assume that the RCU read-side critical section -started first. -In other words, a given instance of <tt>synchronize_rcu()</tt> -can avoid waiting on a given RCU read-side critical section only -if it can prove that <tt>synchronize_rcu()</tt> started first. 
-<p>@@QQE@@ - -<p>@@QQ@@ -The first and second guarantees require unbelievably strict ordering! -Are all these memory barriers <i> really</i> required? -<p>@@QQA@@ -Yes, they really are required. -To see why the first guarantee is required, consider the following -sequence of events: - -<ol> -<li> CPU 1: <tt>rcu_read_lock()</tt> -<li> CPU 1: <tt>q = rcu_dereference(gp); - /* Very likely to return p. */</tt> -<li> CPU 0: <tt>list_del_rcu(p);</tt> -<li> CPU 0: <tt>synchronize_rcu()</tt> starts. -<li> CPU 1: <tt>do_something_with(q->a); - /* No smp_mb(), so might happen after kfree(). */</tt> -<li> CPU 1: <tt>rcu_read_unlock()</tt> -<li> CPU 0: <tt>synchronize_rcu()</tt> returns. -<li> CPU 0: <tt>kfree(p);</tt> -</ol> - -<p> -Therefore, there absolutely must be a full memory barrier between the -end of the RCU read-side critical section and the end of the -grace period. - -<p> -The sequence of events demonstrating the necessity of the second rule -is roughly similar: - -<ol> -<li> CPU 0: <tt>list_del_rcu(p);</tt> -<li> CPU 0: <tt>synchronize_rcu()</tt> starts. -<li> CPU 1: <tt>rcu_read_lock()</tt> -<li> CPU 1: <tt>q = rcu_dereference(gp); - /* Might return p if no memory barrier. */</tt> -<li> CPU 0: <tt>synchronize_rcu()</tt> returns. -<li> CPU 0: <tt>kfree(p);</tt> -<li> CPU 1: <tt>do_something_with(q->a); /* Boom!!! */</tt> -<li> CPU 1: <tt>rcu_read_unlock()</tt> -</ol> - -<p> -And similarly, without a memory barrier between the beginning of the -grace period and the beginning of the RCU read-side critical section, -CPU 1 might end up accessing the freelist. - -<p> -The “as if” rule of course applies, so that any implementation -that acts as if the appropriate memory barriers were in place is a -correct implementation. -That said, it is much easier to fool yourself into believing that you have -adhered to the as-if rule than it is to actually adhere to it! -<p>@@QQE@@ - -<p> -Note that these memory-barrier requirements do not replace the fundamental -RCU requirement that a grace period wait for all pre-existing readers. -On the contrary, the memory barriers called out in this section must operate in -such a way as to <i>enforce</i> this fundamental requirement. -Of course, different implementations enforce this requirement in different -ways, but enforce it they must. - -<h3><a name="RCU Primitives Guaranteed to Execute Unconditionally">RCU Primitives Guaranteed to Execute Unconditionally</a></h3> - -<p> -The common-case RCU primitives are unconditional. -They are invoked, they do their job, and they return, with no possibility -of error, and no need to retry. -This is a key RCU design philosophy. - -<p> -However, this philosophy is pragmatic rather than pigheaded. -If someone comes up with a good justification for a particular conditional -RCU primitive, it might well be implemented and added. -After all, this guarantee was reverse-engineered, not premeditated. -The unconditional nature of the RCU primitives was initially an -accident of implementation, and later experience with synchronization -primitives with conditional primitives caused me to elevate this -accident to a guarantee. -Therefore, the justification for adding a conditional primitive to -RCU would need to be based on detailed and compelling use cases. - -<h3><a name="Guaranteed Read-to-Write Upgrade">Guaranteed Read-to-Write Upgrade</a></h3> - -<p> -As far as RCU is concerned, it is always possible to carry out an -update within an RCU read-side critical section. 
-For example, that RCU read-side critical section might search for -a given data element, and then might acquire the update-side -spinlock in order to update that element, all while remaining -in that RCU read-side critical section. -Of course, it is necessary to exit the RCU read-side critical section -before invoking <tt>synchronize_rcu()</tt>, however, this -inconvenience can be avoided through use of the -<tt>call_rcu()</tt> and <tt>kfree_rcu()</tt> API members -described later in this document. - -<p>@@QQ@@ -But how does the upgrade-to-write operation exclude other readers? -<p>@@QQA@@ -It doesn't, just like normal RCU updates, which also do not exclude -RCU readers. -<p>@@QQE@@ - -<p> -This guarantee allows lookup code to be shared between read-side -and update-side code, and was premeditated, appearing in the earliest -DYNIX/ptx RCU documentation. - -<h2><a name="Fundamental Non-Requirements">Fundamental Non-Requirements</a></h2> - -<p> -RCU provides extremely lightweight readers, and its read-side guarantees, -though quite useful, are correspondingly lightweight. -It is therefore all too easy to assume that RCU is guaranteeing more -than it really is. -Of course, the list of things that RCU does not guarantee is infinitely -long, however, the following sections list a few non-guarantees that -have caused confusion. -Except where otherwise noted, these non-guarantees were premeditated. - -<ol> -<li> <a href="#Readers Impose Minimal Ordering"> - Readers Impose Minimal Ordering</a> -<li> <a href="#Readers Do Not Exclude Updaters"> - Readers Do Not Exclude Updaters</a> -<li> <a href="#Updaters Only Wait For Old Readers"> - Updaters Only Wait For Old Readers</a> -<li> <a href="#Grace Periods Don't Partition Read-Side Critical Sections"> - Grace Periods Don't Partition Read-Side Critical Sections</a> -<li> <a href="#Read-Side Critical Sections Don't Partition Grace Periods"> - Read-Side Critical Sections Don't Partition Grace Periods</a> -<li> <a href="#Disabling Preemption Does Not Block Grace Periods"> - Disabling Preemption Does Not Block Grace Periods</a> -</ol> - -<h3><a name="Readers Impose Minimal Ordering">Readers Impose Minimal Ordering</a></h3> - -<p> -Reader-side markers such as <tt>rcu_read_lock()</tt> and -<tt>rcu_read_unlock()</tt> provide absolutely no ordering guarantees -except through their interaction with the grace-period APIs such as -<tt>synchronize_rcu()</tt>. -To see this, consider the following pair of threads: - -<blockquote> -<pre> - 1 void thread0(void) - 2 { - 3 rcu_read_lock(); - 4 WRITE_ONCE(x, 1); - 5 rcu_read_unlock(); - 6 rcu_read_lock(); - 7 WRITE_ONCE(y, 1); - 8 rcu_read_unlock(); - 9 } -10 -11 void thread1(void) -12 { -13 rcu_read_lock(); -14 r1 = READ_ONCE(y); -15 rcu_read_unlock(); -16 rcu_read_lock(); -17 r2 = READ_ONCE(x); -18 rcu_read_unlock(); -19 } -</pre> -</blockquote> - -<p> -After <tt>thread0()</tt> and <tt>thread1()</tt> execute -concurrently, it is quite possible to have - -<blockquote> -<pre> -(r1 == 1 && r2 == 0) -</pre> -</blockquote> - -(that is, <tt>y</tt> appears to have been assigned before <tt>x</tt>), -which would not be possible if <tt>rcu_read_lock()</tt> and -<tt>rcu_read_unlock()</tt> had much in the way of ordering -properties. -But they do not, so the CPU is within its rights -to do significant reordering. -This is by design: Any significant ordering constraints would slow down -these fast-path APIs. - -<p>@@QQ@@ -Can't the compiler also reorder this code? 
-<p>@@QQA@@ -No, the volatile casts in <tt>READ_ONCE()</tt> and -<tt>WRITE_ONCE()</tt> prevent the compiler from reordering in -this particular case. -<p>@@QQE@@ - -<h3><a name="Readers Do Not Exclude Updaters">Readers Do Not Exclude Updaters</a></h3> - -<p> -Neither <tt>rcu_read_lock()</tt> nor <tt>rcu_read_unlock()</tt> -exclude updates. -All they do is to prevent grace periods from ending. -The following example illustrates this: - -<blockquote> -<pre> - 1 void thread0(void) - 2 { - 3 rcu_read_lock(); - 4 r1 = READ_ONCE(y); - 5 if (r1) { - 6 do_something_with_nonzero_x(); - 7 r2 = READ_ONCE(x); - 8 WARN_ON(!r2); /* BUG!!! */ - 9 } -10 rcu_read_unlock(); -11 } -12 -13 void thread1(void) -14 { -15 spin_lock(&my_lock); -16 WRITE_ONCE(x, 1); -17 WRITE_ONCE(y, 1); -18 spin_unlock(&my_lock); -19 } -</pre> -</blockquote> - -<p> -If the <tt>thread0()</tt> function's <tt>rcu_read_lock()</tt> -excluded the <tt>thread1()</tt> function's update, -the <tt>WARN_ON()</tt> could never fire. -But the fact is that <tt>rcu_read_lock()</tt> does not exclude -much of anything aside from subsequent grace periods, of which -<tt>thread1()</tt> has none, so the -<tt>WARN_ON()</tt> can and does fire. - -<h3><a name="Updaters Only Wait For Old Readers">Updaters Only Wait For Old Readers</a></h3> - -<p> -It might be tempting to assume that after <tt>synchronize_rcu()</tt> -completes, there are no readers executing. -This temptation must be avoided because -new readers can start immediately after <tt>synchronize_rcu()</tt> -starts, and <tt>synchronize_rcu()</tt> is under no -obligation to wait for these new readers. - -<p>@@QQ@@ -Suppose that synchronize_rcu() did wait until all readers had completed. -Would the updater be able to rely on this? -<p>@@QQA@@ -No. -Even if <tt>synchronize_rcu()</tt> were to wait until -all readers had completed, a new reader might start immediately after -<tt>synchronize_rcu()</tt> completed. -Therefore, the code following -<tt>synchronize_rcu()</tt> cannot rely on there being no readers -in any case. -<p>@@QQE@@ - -<h3><a name="Grace Periods Don't Partition Read-Side Critical Sections"> -Grace Periods Don't Partition Read-Side Critical Sections</a></h3> - -<p> -It is tempting to assume that if any part of one RCU read-side critical -section precedes a given grace period, and if any part of another RCU -read-side critical section follows that same grace period, then all of -the first RCU read-side critical section must precede all of the second. -However, this just isn't the case: A single grace period does not -partition the set of RCU read-side critical sections. -An example of this situation can be illustrated as follows, where -<tt>a</tt>, <tt>b</tt>, and <tt>c</tt> are initially all zero: - -<blockquote> -<pre> - 1 void thread0(void) - 2 { - 3 rcu_read_lock(); - 4 WRITE_ONCE(a, 1); - 5 WRITE_ONCE(b, 1); - 6 rcu_read_unlock(); - 7 } - 8 - 9 void thread1(void) -10 { -11 r1 = READ_ONCE(a); -12 synchronize_rcu(); -13 WRITE_ONCE(c, 1); -14 } -15 -16 void thread2(void) -17 { -18 rcu_read_lock(); -19 r2 = READ_ONCE(b); -20 r3 = READ_ONCE(c); -21 rcu_read_unlock(); -22 } -</pre> -</blockquote> - -<p> -It turns out that the outcome: - -<blockquote> -<pre> -(r1 == 1 && r2 == 0 && r3 == 1) -</pre> -</blockquote> - -is entirely possible.
-The following figure shows how this can happen, with each circled -<tt>QS</tt> indicating the point at which RCU recorded a -<i>quiescent state</i> for each thread, that is, a state in which -RCU knows that the thread cannot be in the midst of an RCU read-side -critical section that started before the current grace period: - -<p><img src="GPpartitionReaders1.svg" alt="GPpartitionReaders1.svg" width="60%"></p> - -<p> -If it is necessary to partition RCU read-side critical sections in this -manner, it is necessary to use two grace periods, where the first -grace period is known to end before the second grace period starts: - -<blockquote> -<pre> - 1 void thread0(void) - 2 { - 3 rcu_read_lock(); - 4 WRITE_ONCE(a, 1); - 5 WRITE_ONCE(b, 1); - 6 rcu_read_unlock(); - 7 } - 8 - 9 void thread1(void) -10 { -11 r1 = READ_ONCE(a); -12 synchronize_rcu(); -13 WRITE_ONCE(c, 1); -14 } -15 -16 void thread2(void) -17 { -18 r2 = READ_ONCE(c); -19 synchronize_rcu(); -20 WRITE_ONCE(d, 1); -21 } -22 -23 void thread3(void) -24 { -25 rcu_read_lock(); -26 r3 = READ_ONCE(b); -27 r4 = READ_ONCE(d); -28 rcu_read_unlock(); -29 } -</pre> -</blockquote> - -<p> -Here, if <tt>(r1 == 1)</tt>, then -<tt>thread0()</tt>'s write to <tt>b</tt> must happen -before the end of <tt>thread1()</tt>'s grace period. -If in addition <tt>(r4 == 1)</tt>, then -<tt>thread3()</tt>'s read from <tt>b</tt> must happen -after the beginning of <tt>thread2()</tt>'s grace period. -If it is also the case that <tt>(r2 == 1)</tt>, then the -end of <tt>thread1()</tt>'s grace period must precede the -beginning of <tt>thread2()</tt>'s grace period. -This means that the two RCU read-side critical sections cannot overlap, -guaranteeing that <tt>(r3 == 1)</tt>. -As a result, the outcome: - -<blockquote> -<pre> -(r1 == 1 && r2 == 1 && r3 == 0 && r4 == 1) -</pre> -</blockquote> - -cannot happen. - -<p> -This non-requirement was also non-premeditated, but became apparent -when studying RCU's interaction with memory ordering. - -<h3><a name="Read-Side Critical Sections Don't Partition Grace Periods"> -Read-Side Critical Sections Don't Partition Grace Periods</a></h3> - -<p> -It is also tempting to assume that if an RCU read-side critical section -happens between a pair of grace periods, then those grace periods cannot -overlap. -However, this temptation leads nowhere good, as can be illustrated by -the following, with all variables initially zero: - -<blockquote> -<pre> - 1 void thread0(void) - 2 { - 3 rcu_read_lock(); - 4 WRITE_ONCE(a, 1); - 5 WRITE_ONCE(b, 1); - 6 rcu_read_unlock(); - 7 } - 8 - 9 void thread1(void) -10 { -11 r1 = READ_ONCE(a); -12 synchronize_rcu(); -13 WRITE_ONCE(c, 1); -14 } -15 -16 void thread2(void) -17 { -18 rcu_read_lock(); -19 WRITE_ONCE(d, 1); -20 r2 = READ_ONCE(c); -21 rcu_read_unlock(); -22 } -23 -24 void thread3(void) -25 { -26 r3 = READ_ONCE(d); -27 synchronize_rcu(); -28 WRITE_ONCE(e, 1); -29 } -30 -31 void thread4(void) -32 { -33 rcu_read_lock(); -34 r4 = READ_ONCE(b); -35 r5 = READ_ONCE(e); -36 rcu_read_unlock(); -37 } -</pre> -</blockquote> - -<p> -In this case, the outcome: - -<blockquote> -<pre> -(r1 == 1 && r2 == 1 && r3 == 1 && r4 == 0 && r5 == 1) -</pre> -</blockquote> - -is entirely possible, as illustrated below: - -<p><img src="ReadersPartitionGP1.svg" alt="ReadersPartitionGP1.svg" width="100%"></p> - -<p> -Again, an RCU read-side critical section can overlap almost all of a -given grace period, just so long as it does not overlap the entire -grace period.
-As a result, an RCU read-side critical section cannot partition a pair -of RCU grace periods. - -<p>@@QQ@@ -How long a sequence of grace periods, each separated by an RCU read-side -critical section, would be required to partition the RCU read-side -critical sections at the beginning and end of the chain? -<p>@@QQA@@ -In theory, an infinite number. -In practice, an unknown number that is sensitive to both implementation -details and timing considerations. -Therefore, even in practice, RCU users must abide by the theoretical rather -than the practical answer. -<p>@@QQE@@ - -<h3><a name="Disabling Preemption Does Not Block Grace Periods"> -Disabling Preemption Does Not Block Grace Periods</a></h3> - -<p> -There was a time when disabling preemption on any given CPU would block -subsequent grace periods. -However, this was an accident of implementation and is not a requirement. -And in the current Linux-kernel implementation, disabling preemption -on a given CPU in fact does not block grace periods, as Oleg Nesterov -<a href="https://lkml.kernel.org/g/20150614193825.GA19582@redhat.com">demonstrated</a>. - -<p> -If you need a preempt-disable region to block grace periods, you need to add -<tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>, for example -as follows: - -<blockquote> -<pre> - 1 preempt_disable(); - 2 rcu_read_lock(); - 3 do_something(); - 4 rcu_read_unlock(); - 5 preempt_enable(); - 6 - 7 /* Spinlocks implicitly disable preemption. */ - 8 spin_lock(&mylock); - 9 rcu_read_lock(); -10 do_something(); -11 rcu_read_unlock(); -12 spin_unlock(&mylock); -</pre> -</blockquote> - -<p> -In theory, you could enter the RCU read-side critical section first, -but it is more efficient to keep the entire RCU read-side critical -section contained in the preempt-disable region as shown above. -Of course, RCU read-side critical sections that extend outside of -preempt-disable regions will work correctly, but such critical sections -can be preempted, which forces <tt>rcu_read_unlock()</tt> to do -more work. -And no, this is <i>not</i> an invitation to enclose all of your RCU -read-side critical sections within preempt-disable regions, because -doing so would degrade real-time response. - -<p> -This non-requirement appeared with preemptible RCU. -If you need a grace period that waits on non-preemptible code regions, use -<a href="#Sched Flavor">RCU-sched</a>. - -<h2><a name="Parallelism Facts of Life">Parallelism Facts of Life</a></h2> - -<p> -These parallelism facts of life are by no means specific to RCU, but -the RCU implementation must abide by them. -They therefore bear repeating: - -<ol> -<li> Any CPU or task may be delayed at any time, - and any attempts to avoid these delays by disabling - preemption, interrupts, or whatever are completely futile. - This is most obvious in preemptible user-level - environments and in virtualized environments (where - a given guest OS's VCPUs can be preempted at any time by - the underlying hypervisor), but can also happen in bare-metal - environments due to ECC errors, NMIs, and other hardware - events. - Although a delay of more than about 20 seconds can result - in splats, the RCU implementation is obligated to use - algorithms that can tolerate extremely long delays, but where - “extremely long” is not long enough to allow - wrap-around when incrementing a 64-bit counter. -<li> Both the compiler and the CPU can reorder memory accesses. - Where it matters, RCU must use compiler directives and - memory-barrier instructions to preserve ordering. 
-<li> Conflicting writes to memory locations in any given cache line - will result in expensive cache misses. - Greater numbers of concurrent writes and more-frequent - concurrent writes will result in more dramatic slowdowns. - RCU is therefore obligated to use algorithms that have - sufficient locality to avoid significant performance and - scalability problems. -<li> As a rough rule of thumb, only one CPU's worth of processing - may be carried out under the protection of any given exclusive - lock. - RCU must therefore use scalable locking designs. -<li> Counters are finite, especially on 32-bit systems. - RCU's use of counters must therefore tolerate counter wrap, - or be designed such that counter wrap would take way more - time than a single system is likely to run. - An uptime of ten years is quite possible, a runtime - of a century much less so. - As an example of the latter, RCU's dyntick-idle nesting counter - allows 54 bits for interrupt nesting level (this counter - is 64 bits even on a 32-bit system). - Overflowing this counter requires 2<sup>54</sup> - half-interrupts on a given CPU without that CPU ever going idle. - If a half-interrupt happened every microsecond, it would take - 570 years of runtime to overflow this counter, which is currently - believed to be an acceptably long time. -<li> Linux systems can have thousands of CPUs running a single - Linux kernel in a single shared-memory environment. - RCU must therefore pay close attention to high-end scalability. -</ol> - -<p> -This last parallelism fact of life means that RCU must pay special -attention to the preceding facts of life. -The idea that Linux might scale to systems with thousands of CPUs would -have been met with some skepticism in the 1990s, but these requirements -would otherwise have been unsurprising, even in the early 1990s. - -<h2><a name="Quality-of-Implementation Requirements">Quality-of-Implementation Requirements</a></h2> - -<p> -These sections list quality-of-implementation requirements. -Although an RCU implementation that ignores these requirements could -still be used, it would likely be subject to limitations that would -make it inappropriate for industrial-strength production use. -Classes of quality-of-implementation requirements are as follows: - -<ol> -<li> <a href="#Specialization">Specialization</a> -<li> <a href="#Performance and Scalability">Performance and Scalability</a> -<li> <a href="#Composability">Composability</a> -<li> <a href="#Corner Cases">Corner Cases</a> -</ol> - -<p> -These classes are covered in the following sections. - -<h3><a name="Specialization">Specialization</a></h3> - -<p> -RCU is and always has been intended primarily for read-mostly situations, as -illustrated by the following figure. -This means that RCU's read-side primitives are optimized, often at the -expense of its update-side primitives. - -<p><img src="RCUApplicability.svg" alt="RCUApplicability.svg" width="70%"></p> - -<p> -This focus on read-mostly situations means that RCU must interoperate -with other synchronization primitives. -For example, the <tt>add_gp()</tt> and <tt>remove_gp_synchronous()</tt> -examples discussed earlier use RCU to protect readers and locking to -coordinate updaters. -However, the need extends much farther, requiring that a variety of -synchronization primitives be legal within RCU read-side critical sections, -including spinlocks, sequence locks, atomic operations, reference -counters, and memory barriers. - -<p>@@QQ@@ -What about sleeping locks?
-<p>@@QQA@@ -These are forbidden within Linux-kernel RCU read-side critical sections -because it is not legal to place a quiescent state (in this case, -voluntary context switch) within an RCU read-side critical section. -However, sleeping locks may be used within userspace RCU read-side critical -sections, and also within Linux-kernel sleepable RCU -<a href="#Sleepable RCU">(SRCU)</a> -read-side critical sections. -In addition, the -rt patchset turns spinlocks into sleeping locks so -that the corresponding critical sections can be preempted, which -also means that these sleeplockified spinlocks (but not other sleeping locks!) -may be acquired within -rt-Linux-kernel RCU read-side critical sections. - -<p> -Note that it <i>is</i> legal for a normal RCU read-side critical section -to conditionally acquire a sleeping lock (as in <tt>mutex_trylock()</tt>), -but only as long as it does not loop indefinitely attempting to -conditionally acquire that sleeping lock. -The key point is that things like <tt>mutex_trylock()</tt> -either return with the mutex held, or return an error indication if -the mutex was not immediately available. -Either way, <tt>mutex_trylock()</tt> returns immediately without sleeping. -A sketch of this conditional-acquisition pattern appears later in this section. -<p>@@QQE@@ - -<p> -It often comes as a surprise that many algorithms do not require a -consistent view of data, but can instead tolerate stale or inconsistent data, -with network routing being the poster child. -Internet routing algorithms take significant time to propagate -updates, so that by the time an update arrives at a given system, -that system has been sending network traffic the wrong way for -a considerable length of time. -Having a few threads continue to send traffic the wrong way for a -few more milliseconds is clearly not a problem: In the worst case, -TCP retransmissions will eventually get the data where it needs to go. -In general, when tracking the state of the universe outside of the -computer, some level of inconsistency must be tolerated due to -speed-of-light delays if nothing else. - -<p> -Furthermore, uncertainty about external state is inherent in many cases. -For example, a pair of veterinarians might use heartbeat to determine -whether or not a given cat was alive. -But how long should they wait after the last heartbeat to decide that -the cat is in fact dead? -Waiting less than 400 milliseconds makes no sense because this would -mean that a relaxed cat would be considered to cycle between death -and life more than 100 times per minute. -Moreover, just as with human beings, a cat's heart might stop for -some period of time, so the exact wait period is a judgment call. -One of our pair of veterinarians might wait 30 seconds before pronouncing -the cat dead, while the other might insist on waiting a full minute. -The two veterinarians would then disagree on the state of the cat during -the final 30 seconds of the minute following the last heartbeat, as -fancifully illustrated below: - -<p><img src="2013-08-is-it-dead.png" alt="2013-08-is-it-dead.png" width="431"></p> - -<p> -Interestingly enough, this same situation applies to hardware. -When push comes to shove, how do we tell whether or not some -external server has failed? -We send messages to it periodically, and declare it failed if we -don't receive a response within a given period of time. -Policy decisions can usually tolerate short -periods of inconsistency. -The policy was decided some time ago, and is only now being put into -effect, so a few milliseconds of delay is normally inconsequential.
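-
-<p>
-The quick quiz above notes that a normal RCU read-side critical section
-may acquire a sleeping lock only conditionally.
-A minimal sketch of that pattern follows; the <tt>->lock</tt> field and
-the <tt>search_foo()</tt> lookup helper are hypothetical and are not part
-of any Linux-kernel API:
-
-<blockquote>
-<pre>
- 1 bool try_update_foo(int key, int new_b)
- 2 {
- 3   struct foo *p;
- 4   bool ret = false;
- 5
- 6   rcu_read_lock();
- 7   p = search_foo(key); /* Hypothetical RCU-protected lookup. */
- 8   if (p && mutex_trylock(&p->lock)) {
- 9     p->b = new_b; /* Update under the per-element lock. */
-10     mutex_unlock(&p->lock);
-11     ret = true;
-12   }
-13   rcu_read_unlock();
-14   return ret;
-15 }
-</pre>
-</blockquote>
-
-<p>
-Because <tt>mutex_trylock()</tt> never sleeps, no quiescent state can
-occur within the RCU read-side critical section, and this function is
-one example of the interoperation between RCU and other synchronization
-primitives called for above.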
- -<p> -However, there are algorithms that absolutely must see consistent data. -For example, the translation between a user-level SystemV semaphore -ID to the corresponding in-kernel data structure is protected by RCU, -but it is absolutely forbidden to update a semaphore that has just been -removed. -In the Linux kernel, this need for consistency is accommodated by acquiring -spinlocks located in the in-kernel data structure from within -the RCU read-side critical section, and this is indicated by the -green box in the figure above. -Many other techniques may be used, and are in fact used within the -Linux kernel. - -<p> -In short, RCU is not required to maintain consistency, and other -mechanisms may be used in concert with RCU when consistency is required. -RCU's specialization allows it to do its job extremely well, and its -ability to interoperate with other synchronization mechanisms allows -the right mix of synchronization tools to be used for a given job. - -<h3><a name="Performance and Scalability">Performance and Scalability</a></h3> - -<p> -Energy efficiency is a critical component of performance today, -and Linux-kernel RCU implementations must therefore avoid unnecessarily -awakening idle CPUs. -I cannot claim that this requirement was premeditated. -In fact, I learned of it during a telephone conversation in which I -was given “frank and open” feedback on the importance -of energy efficiency in battery-powered systems and on specific -energy-efficiency shortcomings of the Linux-kernel RCU implementation. -In my experience, the battery-powered embedded community will consider -any unnecessary wakeups to be extremely unfriendly acts. -So much so that mere Linux-kernel-mailing-list posts are -insufficient to vent their ire. - -<p> -Memory consumption is not particularly important in most -situations, and has become decreasingly -so as memory sizes have expanded and memory -costs have plummeted. -However, as I learned from Matt Mackall's -<a href="http://elinux.org/Linux_Tiny-FAQ">bloatwatch</a> -efforts, memory footprint is critically important on single-CPU systems with -non-preemptible (<tt>CONFIG_PREEMPT=n</tt>) kernels, and thus -<a href="https://lkml.kernel.org/g/20090113221724.GA15307@linux.vnet.ibm.com">tiny RCU</a> -was born. -Josh Triplett has since taken over the small-memory banner with his -<a href="https://tiny.wiki.kernel.org/">Linux kernel tinification</a> -project, which resulted in -<a href="#Sleepable RCU">SRCU</a> -becoming optional for those kernels not needing it. - -<p> -The remaining performance requirements are, for the most part, -unsurprising. -For example, in keeping with RCU's read-side specialization, -<tt>rcu_dereference()</tt> should have negligible overhead (for -example, suppression of a few minor compiler optimizations). -Similarly, in non-preemptible environments, <tt>rcu_read_lock()</tt> and -<tt>rcu_read_unlock()</tt> should have exactly zero overhead. - -<p> -In preemptible environments, in the case where the RCU read-side -critical section was not preempted (as will be the case for the -highest-priority real-time process), <tt>rcu_read_lock()</tt> and -<tt>rcu_read_unlock()</tt> should have minimal overhead. -In particular, they should not contain atomic read-modify-write -operations, memory-barrier instructions, preemption disabling, -interrupt disabling, or backwards branches. -However, in the case where the RCU read-side critical section was preempted, -<tt>rcu_read_unlock()</tt> may acquire spinlocks and disable interrupts.
-This is why it is better to nest an RCU read-side critical section -within a preempt-disable region than vice versa, at least in cases -where that critical section is short enough to avoid unduly degrading -real-time latencies. - -<p> -The <tt>synchronize_rcu()</tt> grace-period-wait primitive is -optimized for throughput. -It may therefore incur several milliseconds of latency in addition to -the duration of the longest RCU read-side critical section. -On the other hand, multiple concurrent invocations of -<tt>synchronize_rcu()</tt> are required to use batching optimizations -so that they can be satisfied by a single underlying grace-period-wait -operation. -For example, in the Linux kernel, it is not unusual for a single -grace-period-wait operation to serve more than -<a href="https://www.usenix.org/conference/2004-usenix-annual-technical-conference/making-rcu-safe-deep-sub-millisecond-response">1,000 separate invocations</a> -of <tt>synchronize_rcu()</tt>, thus amortizing the per-invocation -overhead down to nearly zero. -However, the grace-period optimization is also required to avoid -measurable degradation of real-time scheduling and interrupt latencies. - -<p> -In some cases, the multi-millisecond <tt>synchronize_rcu()</tt> -latencies are unacceptable. -In these cases, <tt>synchronize_rcu_expedited()</tt> may be used -instead, reducing the grace-period latency down to a few tens of -microseconds on small systems, at least in cases where the RCU read-side -critical sections are short. -There are currently no special latency requirements for -<tt>synchronize_rcu_expedited()</tt> on large systems, but, -consistent with the empirical nature of the RCU specification, -that is subject to change. -However, there most definitely are scalability requirements: -A storm of <tt>synchronize_rcu_expedited()</tt> invocations on 4096 -CPUs should at least make reasonable forward progress. -In return for its shorter latencies, <tt>synchronize_rcu_expedited()</tt> -is permitted to impose modest degradation of real-time latency -on non-idle online CPUs. -That said, it will likely be necessary to take further steps to reduce this -degradation, hopefully to roughly that of a scheduling-clock interrupt. - -<p> -There are a number of situations where even -<tt>synchronize_rcu_expedited()</tt>'s reduced grace-period -latency is unacceptable. -In these situations, the asynchronous <tt>call_rcu()</tt> can be -used in place of <tt>synchronize_rcu()</tt> as follows: - -<blockquote> -<pre> - 1 struct foo { - 2 int a; - 3 int b; - 4 struct rcu_head rh; - 5 }; - 6 - 7 static void remove_gp_cb(struct rcu_head *rhp) - 8 { - 9 struct foo *p = container_of(rhp, struct foo, rh); -10 -11 kfree(p); -12 } -13 -14 bool remove_gp_asynchronous(void) -15 { -16 struct foo *p; -17 -18 spin_lock(&gp_lock); -19 p = rcu_access_pointer(gp); -20 if (!p) { -21 spin_unlock(&gp_lock); -22 return false; -23 } -24 rcu_assign_pointer(gp, NULL); -25 call_rcu(&p->rh, remove_gp_cb); -26 spin_unlock(&gp_lock); -27 return true; -28 } -</pre> -</blockquote> - -<p> -A definition of <tt>struct foo</tt> is finally needed, and appears -on lines 1-5. -The function <tt>remove_gp_cb()</tt> is passed to <tt>call_rcu()</tt> -on line 25, and will be invoked after the end of a subsequent -grace period. -This gets the same effect as <tt>remove_gp_synchronous()</tt>, -but without forcing the updater to wait for a grace period to elapse.
-The <tt>call_rcu()</tt> function may be used in a number of -situations where neither <tt>synchronize_rcu()</tt> nor -<tt>synchronize_rcu_expedited()</tt> would be legal, -including within preempt-disable code, <tt>local_bh_disable()</tt> code, -interrupt-disable code, and interrupt handlers. -However, even <tt>call_rcu()</tt> is illegal within NMI handlers. -The callback function (<tt>remove_gp_cb()</tt> in this case) will be -executed within a softirq (software interrupt) environment within the -Linux kernel, -either within a real softirq handler or under the protection -of <tt>local_bh_disable()</tt>. -In both the Linux kernel and in userspace, it is bad practice to -write an RCU callback function that takes too long. -Long-running operations should be relegated to separate threads or -(in the Linux kernel) workqueues. - -<p>@@QQ@@ -Why does line 19 use <tt>rcu_access_pointer()</tt>? -After all, <tt>call_rcu()</tt> on line 25 stores into the -structure, which would interact badly with concurrent insertions. -Doesn't this mean that <tt>rcu_dereference()</tt> is required? -<p>@@QQA@@ -Presumably the <tt>->gp_lock</tt> acquired on line 18 excludes -any changes, including any insertions that <tt>rcu_dereference()</tt> -would protect against. -Therefore, any insertions will be delayed until after <tt>->gp_lock</tt> -is released on line 26, which in turn means that -<tt>rcu_access_pointer()</tt> suffices. -<p>@@QQE@@ - -<p> -However, all that <tt>remove_gp_cb()</tt> is doing is -invoking <tt>kfree()</tt> on the data element. -This is a common idiom, and is supported by <tt>kfree_rcu()</tt>, -which allows “fire and forget” operation as shown below: - -<blockquote> -<pre> - 1 struct foo { - 2 int a; - 3 int b; - 4 struct rcu_head rh; - 5 }; - 6 - 7 bool remove_gp_faf(void) - 8 { - 9 struct foo *p; -10 -11 spin_lock(&gp_lock); -12 p = rcu_dereference(gp); -13 if (!p) { -14 spin_unlock(&gp_lock); -15 return false; -16 } -17 rcu_assign_pointer(gp, NULL); -18 kfree_rcu(p, rh); -19 spin_unlock(&gp_lock); -20 return true; -21 } -</pre> -</blockquote> - -<p> -Note that <tt>remove_gp_faf()</tt> simply invokes -<tt>kfree_rcu()</tt> and proceeds, without any need to pay any -further attention to the subsequent grace period and <tt>kfree()</tt>. -It is permissible to invoke <tt>kfree_rcu()</tt> from the same -environments as for <tt>call_rcu()</tt>. -Interestingly enough, DYNIX/ptx had the equivalents of -<tt>call_rcu()</tt> and <tt>kfree_rcu()</tt>, but not -<tt>synchronize_rcu()</tt>. -This was due to the fact that RCU was not heavily used within DYNIX/ptx, -so the very few places that needed something like -<tt>synchronize_rcu()</tt> simply open-coded it. - -<p>@@QQ@@ -Earlier it was claimed that <tt>call_rcu()</tt> and -<tt>kfree_rcu()</tt> allowed updaters to avoid being blocked -by readers. -But how can that be correct, given that the invocation of the callback -and the freeing of the memory (respectively) must still wait for -a grace period to elapse? -<p>@@QQA@@ -We could define things this way, but keep in mind that this sort of -definition would say that updates in garbage-collected languages -cannot complete until the next time the garbage collector runs, -which does not seem at all reasonable. -The key point is that in most cases, an updater using either -<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> can proceed to the -next update as soon as it has invoked <tt>call_rcu()</tt> or -<tt>kfree_rcu()</tt>, without having to wait for a subsequent -grace period.
-<p>@@QQE@@ - -<p> -But what if the updater must wait for the completion of code to be -executed after the end of the grace period, but has other tasks -that can be carried out in the meantime? -The polling-style <tt>get_state_synchronize_rcu()</tt> and -<tt>cond_synchronize_rcu()</tt> functions may be used for this -purpose, as shown below: - -<blockquote> -<pre> - 1 bool remove_gp_poll(void) - 2 { - 3 struct foo *p; - 4 unsigned long s; - 5 - 6 spin_lock(&gp_lock); - 7 p = rcu_access_pointer(gp); - 8 if (!p) { - 9 spin_unlock(&gp_lock); -10 return false; -11 } -12 rcu_assign_pointer(gp, NULL); -13 spin_unlock(&gp_lock); -14 s = get_state_synchronize_rcu(); -15 do_something_while_waiting(); -16 cond_synchronize_rcu(s); -17 kfree(p); -18 return true; -19 } -</pre> -</blockquote> - -<p> -On line 14, <tt>get_state_synchronize_rcu()</tt> obtains a -“cookie” from RCU, -then line 15 carries out other tasks, -and finally, line 16 returns immediately if a grace period has -elapsed in the meantime, but otherwise waits as required. -The need for <tt>get_state_synchronize_rcu</tt> and -<tt>cond_synchronize_rcu()</tt> has appeared quite recently, -so it is too early to tell whether they will stand the test of time. - -<p> -RCU thus provides a range of tools to allow updaters to strike the -required tradeoff between latency, flexibility and CPU overhead. - -<h3><a name="Composability">Composability</a></h3> - -<p> -Composability has received much attention in recent years, perhaps in part -due to the collision of multicore hardware with object-oriented techniques -designed in single-threaded environments for single-threaded use. -And in theory, RCU read-side critical sections may be composed, and in -fact may be nested arbitrarily deeply. -In practice, as with all real-world implementations of composable -constructs, there are limitations. - -<p> -Implementations of RCU for which <tt>rcu_read_lock()</tt> -and <tt>rcu_read_unlock()</tt> generate no code, such as -Linux-kernel RCU when <tt>CONFIG_PREEMPT=n</tt>, can be -nested arbitrarily deeply. -After all, there is no overhead. -Except that if all these instances of <tt>rcu_read_lock()</tt> -and <tt>rcu_read_unlock()</tt> are visible to the compiler, -compilation will eventually fail due to exhausting memory, -mass storage, or user patience, whichever comes first. -If the nesting is not visible to the compiler, as is the case with -mutually recursive functions each in its own translation unit, -stack overflow will result. -If the nesting takes the form of loops, either the control variable -will overflow or (in the Linux kernel) you will get an RCU CPU stall warning. -Nevertheless, this class of RCU implementations is one -of the most composable constructs in existence. - -<p> -RCU implementations that explicitly track nesting depth -are limited by the nesting-depth counter. -For example, the Linux kernel's preemptible RCU limits nesting to -<tt>INT_MAX</tt>. -This should suffice for almost all practical purposes. -That said, a consecutive pair of RCU read-side critical sections -between which there is an operation that waits for a grace period -cannot be enclosed in another RCU read-side critical section. -This is because it is not legal to wait for a grace period within -an RCU read-side critical section: To do so would result either -in deadlock or -in RCU implicitly splitting the enclosing RCU read-side critical -section, neither of which is conducive to a long-lived and prosperous -kernel. 
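-
-<p>
-A minimal sketch of this composability is shown below, in which a lookup
-helper containing its own RCU read-side critical section is invoked from
-within an enclosing critical section.
-The <tt>search_foo()</tt> helper and both functions are hypothetical:
-
-<blockquote>
-<pre>
- 1 int get_foo_a(int key)
- 2 {
- 3   struct foo *p;
- 4   int ret = -1;
- 5
- 6   rcu_read_lock();
- 7   p = search_foo(key); /* Hypothetical RCU-protected lookup. */
- 8   if (p)
- 9     ret = p->a;
-10   rcu_read_unlock();
-11   return ret;
-12 }
-13
-14 int get_foo_sum(int key1, int key2)
-15 {
-16   int sum;
-17
-18   rcu_read_lock(); /* Outer critical section... */
-19   sum = get_foo_a(key1) + get_foo_a(key2); /* ...inner sections nest. */
-20   rcu_read_unlock();
-21   return sum;
-22 }
-</pre>
-</blockquote>
-
-<p>
-Of course, inserting a grace-period wait such as <tt>synchronize_rcu()</tt>
-between the two lookups on line 19 would fall afoul of the prohibition
-called out above.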
- -<p> -It is worth noting that RCU is not alone in limiting composability. -For example, many transactional-memory implementations prohibit -composing a pair of transactions separated by an irrevocable -operation (for example, a network receive operation). -For another example, lock-based critical sections can be composed -surprisingly freely, but only if deadlock is avoided. - -<p> -In short, although RCU read-side critical sections are highly composable, -care is required in some situations, just as is the case for any other -composable synchronization mechanism. - -<h3><a name="Corner Cases">Corner Cases</a></h3> - -<p> -A given RCU workload might have an endless and intense stream of -RCU read-side critical sections, perhaps even so intense that there -was never a point in time during which there was not at least one -RCU read-side critical section in flight. -RCU cannot allow this situation to block grace periods: As long as -all the RCU read-side critical sections are finite, grace periods -must also be finite. - -<p> -That said, preemptible RCU implementations could potentially result -in RCU read-side critical sections being preempted for long durations, -which has the effect of creating a long-duration RCU read-side -critical section. -This situation can arise only in heavily loaded systems, but systems using -real-time priorities are of course more vulnerable. -Therefore, RCU priority boosting is provided to help deal with this -case. -That said, the exact requirements on RCU priority boosting will likely -evolve as more experience accumulates. - -<p> -Other workloads might have very high update rates. -Although one can argue that such workloads should instead use -something other than RCU, the fact remains that RCU must -handle such workloads gracefully. -This requirement is another factor driving batching of grace periods, -but it is also the driving force behind the checks for large numbers -of queued RCU callbacks in the <tt>call_rcu()</tt> code path. -Finally, high update rates should not delay RCU read-side critical -sections, although some read-side delays can occur when using -<tt>synchronize_rcu_expedited()</tt>, courtesy of this function's use -of <tt>try_stop_cpus()</tt>. -(In the future, <tt>synchronize_rcu_expedited()</tt> will be -converted to use lighter-weight inter-processor interrupts (IPIs), -but this will still disturb readers, though to a much smaller degree.) - -<p> -Although all three of these corner cases were understood in the early -1990s, a simple user-level test consisting of <tt>close(open(path))</tt> -in a tight loop -in the early 2000s suddenly provided a much deeper appreciation of the -high-update-rate corner case. -This test also motivated addition of some RCU code to react to high update -rates, for example, if a given CPU finds itself with more than 10,000 -RCU callbacks queued, it will cause RCU to take evasive action by -more aggressively starting grace periods and more aggressively forcing -completion of grace-period processing. -This evasive action causes the grace period to complete more quickly, -but at the cost of restricting RCU's batching optimizations, thus -increasing the CPU overhead incurred by that grace period. 
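-
-<p>
-For reference, the user-level test in question amounts to nothing more
-than the following loop, shown here with a hypothetical path name and
-open flags:
-
-<blockquote>
-<pre>
- 1 /* Each pass creates and destroys kernel state, generating updates. */
- 2 for (;;)
- 3   close(open("/tmp/some-file", O_RDONLY | O_CREAT, 0600));
-</pre>
-</blockquote>
-
-<p>
-Despite its simplicity, this loop generates kernel updates quickly enough
-to exercise the callback-flooding defenses described above.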
- -<h2><a name="Software-Engineering Requirements"> -Software-Engineering Requirements</a></h2> - -<p> -Between Murphy's Law and “To err is human”, it is necessary to -guard against mishaps and misuse: - -<ol> -<li> It is all too easy to forget to use <tt>rcu_read_lock()</tt> - everywhere that it is needed, so kernels built with - <tt>CONFIG_PROVE_RCU=y</tt> will splat if - <tt>rcu_dereference()</tt> is used outside of an - RCU read-side critical section. - Update-side code can use <tt>rcu_dereference_protected()</tt>, - which takes a - <a href="https://lwn.net/Articles/371986/">lockdep expression</a> - to indicate what is providing the protection. - If the indicated protection is not provided, a lockdep splat - is emitted. - - <p> - Code shared between readers and updaters can use - <tt>rcu_dereference_check()</tt>, which also takes a - lockdep expression, and emits a lockdep splat if neither - <tt>rcu_read_lock()</tt> nor the indicated protection - is in place. - In addition, <tt>rcu_dereference_raw()</tt> is used in those - (hopefully rare) cases where the required protection cannot - be easily described. - Finally, <tt>rcu_read_lock_held()</tt> is provided to - allow a function to verify that it has been invoked within - an RCU read-side critical section. - I was made aware of this set of requirements shortly after Thomas - Gleixner audited a number of RCU uses. -<li> A given function might wish to check for RCU-related preconditions - upon entry, before using any other RCU API. - The <tt>rcu_lockdep_assert()</tt> macro does this job, - asserting the expression in kernels having lockdep enabled - and doing nothing otherwise. -<li> It is also easy to forget to use <tt>rcu_assign_pointer()</tt> - and <tt>rcu_dereference()</tt>, perhaps (incorrectly) - substituting a simple assignment. - To catch this sort of error, a given RCU-protected pointer may be - tagged with <tt>__rcu</tt>, after which running sparse - with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt> will complain - about simple-assignment accesses to that pointer. - Arnd Bergmann made me aware of this requirement, and also - supplied the needed - <a href="https://lwn.net/Articles/376011/">patch series</a>. -<li> Kernels built with <tt>CONFIG_DEBUG_OBJECTS_RCU_HEAD=y</tt> - will splat if a data element is passed to <tt>call_rcu()</tt> - twice in a row, without a grace period in between. - (This error is similar to a double free.) - The corresponding <tt>rcu_head</tt> structures that are - dynamically allocated are automatically tracked, but - <tt>rcu_head</tt> structures allocated on the stack - must be initialized with <tt>init_rcu_head_on_stack()</tt> - and cleaned up with <tt>destroy_rcu_head_on_stack()</tt>. - Similarly, statically allocated non-stack <tt>rcu_head</tt> - structures must be initialized with <tt>init_rcu_head()</tt> - and cleaned up with <tt>destroy_rcu_head()</tt>. - Mathieu Desnoyers made me aware of this requirement, and also - supplied the needed - <a href="https://lkml.kernel.org/g/20100319013024.GA28456@Krystal">patch</a>. -<li> An infinite loop in an RCU read-side critical section will - eventually trigger an RCU CPU stall warning splat, with - the duration of “eventually” being controlled by the - <tt>RCU_CPU_STALL_TIMEOUT</tt> <tt>Kconfig</tt> option, or, - alternatively, by the - <tt>rcupdate.rcu_cpu_stall_timeout</tt> boot/sysfs - parameter. - However, RCU is not obligated to produce this splat - unless there is a grace period waiting on that particular - RCU read-side critical section.
- <p> - Some extreme workloads might intentionally delay - RCU grace periods, and systems running those workloads can - be booted with <tt>rcupdate.rcu_cpu_stall_suppress</tt> - to suppress the splats. - This kernel parameter may also be set via <tt>sysfs</tt>. - Furthermore, RCU CPU stall warnings are counter-productive - during sysrq dumps and during panics. - RCU therefore supplies the <tt>rcu_sysrq_start()</tt> and - <tt>rcu_sysrq_end()</tt> API members to be called before - and after long sysrq dumps. - RCU also supplies the <tt>rcu_panic()</tt> notifier that is - automatically invoked at the beginning of a panic to suppress - further RCU CPU stall warnings. - - <p> - This requirement made itself known in the early 1990s, pretty - much the first time that it was necessary to debug a CPU stall. - That said, the initial implementation in DYNIX/ptx was quite - generic in comparison with that of Linux. -<li> Although it would be very good to detect pointers leaking out - of RCU read-side critical sections, there is currently no - good way of doing this. - One complication is the need to distinguish between pointers - leaking and pointers that have been handed off from RCU to - some other synchronization mechanism, for example, reference - counting. -<li> In kernels built with <tt>CONFIG_RCU_TRACE=y</tt>, RCU-related - information is provided via both debugfs and event tracing. -<li> Open-coded use of <tt>rcu_assign_pointer()</tt> and - <tt>rcu_dereference()</tt> to create typical linked - data structures can be surprisingly error-prone. - Therefore, RCU-protected - <a href="https://lwn.net/Articles/609973/#RCU List APIs">linked lists</a> - and, more recently, RCU-protected - <a href="https://lwn.net/Articles/612100/">hash tables</a> - are available. - Many other special-purpose RCU-protected data structures are - available in the Linux kernel and the userspace RCU library. -<li> Some linked structures are created at compile time, but still - require <tt>__rcu</tt> checking. - The <tt>RCU_POINTER_INITIALIZER()</tt> macro serves this - purpose. -<li> It is not necessary to use <tt>rcu_assign_pointer()</tt> - when creating linked structures that are to be published via - a single external pointer. - The <tt>RCU_INIT_POINTER()</tt> macro is provided for - this task and also for assigning <tt>NULL</tt> pointers - at runtime. -</ol> - -<p> -This is not a hard-and-fast list: RCU's diagnostic capabilities will -continue to be guided by the number and type of usage bugs found -in real-world RCU usage. - -<h2><a name="Linux Kernel Complications">Linux Kernel Complications</a></h2> - -<p> -The Linux kernel provides an interesting environment for all kinds of -software, including RCU. -Some of the relevant points of interest are as follows: - -<ol> -<li> <a href="#Configuration">Configuration</a>. -<li> <a href="#Firmware Interface">Firmware Interface</a>. -<li> <a href="#Early Boot">Early Boot</a>. -<li> <a href="#Interrupts and NMIs"> - Interrupts and non-maskable interrupts (NMIs)</a>. -<li> <a href="#Loadable Modules">Loadable Modules</a>. -<li> <a href="#Hotplug CPU">Hotplug CPU</a>. -<li> <a href="#Scheduler and RCU">Scheduler and RCU</a>. -<li> <a href="#Tracing and RCU">Tracing and RCU</a>. -<li> <a href="#Energy Efficiency">Energy Efficiency</a>. -<li> <a href="#Memory Efficiency">Memory Efficiency</a>. -<li> <a href="#Performance, Scalability, Response Time, and Reliability"> - Performance, Scalability, Response Time, and Reliability</a>.
-</ol> - -<p> -This list is probably incomplete, but it does give a feel for the -most notable Linux-kernel complications. -Each of the following sections covers one of the above topics. - -<h3><a name="Configuration">Configuration</a></h3> - -<p> -RCU's goal is automatic configuration, so that almost nobody -needs to worry about RCU's <tt>Kconfig</tt> options. -And for almost all users, RCU does in fact work well -“out of the box.” - -<p> -However, there are specialized use cases that are handled by -kernel boot parameters and <tt>Kconfig</tt> options. -Unfortunately, the <tt>Kconfig</tt> system will explicitly ask users -about new <tt>Kconfig</tt> options, which requires almost all of them -be hidden behind a <tt>CONFIG_RCU_EXPERT</tt> <tt>Kconfig</tt> option. - -<p> -This all should be quite obvious, but the fact remains that -Linus Torvalds recently had to -<a href="https://lkml.kernel.org/g/CA+55aFy4wcCwaL4okTs8wXhGZ5h-ibecy_Meg9C4MNQrUnwMcg@mail.gmail.com">remind</a> -me of this requirement. - -<h3><a name="Firmware Interface">Firmware Interface</a></h3> - -<p> -In many cases, kernel obtains information about the system from the -firmware, and sometimes things are lost in translation. -Or the translation is accurate, but the original message is bogus. - -<p> -For example, some systems' firmware overreports the number of CPUs, -sometimes by a large factor. -If RCU naively believed the firmware, as it used to do, -it would create too many per-CPU kthreads. -Although the resulting system will still run correctly, the extra -kthreads needlessly consume memory and can cause confusion -when they show up in <tt>ps</tt> listings. - -<p> -RCU must therefore wait for a given CPU to actually come online before -it can allow itself to believe that the CPU actually exists. -The resulting “ghost CPUs” (which are never going to -come online) cause a number of -<a href="https://paulmck.livejournal.com/37494.html">interesting complications</a>. - -<h3><a name="Early Boot">Early Boot</a></h3> - -<p> -The Linux kernel's boot sequence is an interesting process, -and RCU is used early, even before <tt>rcu_init()</tt> -is invoked. -In fact, a number of RCU's primitives can be used as soon as the -initial task's <tt>task_struct</tt> is available and the -boot CPU's per-CPU variables are set up. -The read-side primitives (<tt>rcu_read_lock()</tt>, -<tt>rcu_read_unlock()</tt>, <tt>rcu_dereference()</tt>, -and <tt>rcu_access_pointer()</tt>) will operate normally very early on, -as will <tt>rcu_assign_pointer()</tt>. - -<p> -Although <tt>call_rcu()</tt> may be invoked at any -time during boot, callbacks are not guaranteed to be invoked until after -the scheduler is fully up and running. -This delay in callback invocation is due to the fact that RCU does not -invoke callbacks until it is fully initialized, and this full initialization -cannot occur until after the scheduler has initialized itself to the -point where RCU can spawn and run its kthreads. -In theory, it would be possible to invoke callbacks earlier, -however, this is not a panacea because there would be severe restrictions -on what operations those callbacks could invoke. - -<p> -Perhaps surprisingly, <tt>synchronize_rcu()</tt>, -<a href="#Bottom-Half Flavor"><tt>synchronize_rcu_bh()</tt></a> -(<a href="#Bottom-Half Flavor">discussed below</a>), -and -<a href="#Sched Flavor"><tt>synchronize_sched()</tt></a> -will all operate normally -during very early boot, the reason being that there is only one CPU -and preemption is disabled. 
-This means that a call to <tt>synchronize_rcu()</tt> (or friends) -is itself a quiescent -state and thus a grace period, so the early-boot implementation can -be a no-op. - -<p> -Both <tt>synchronize_rcu_bh()</tt> and <tt>synchronize_sched()</tt> -continue to operate normally through the remainder of boot, courtesy -of the fact that preemption is disabled across their RCU read-side -critical sections and also courtesy of the fact that there is still -only one CPU. -However, once the scheduler starts initializing, preemption is enabled. -There is still only a single CPU, but the fact that preemption is enabled -means that the no-op implementation of <tt>synchronize_rcu()</tt> no -longer works in <tt>CONFIG_PREEMPT=y</tt> kernels. -Therefore, as soon as the scheduler starts initializing, the early-boot -fastpath is disabled. -This means that <tt>synchronize_rcu()</tt> switches to its runtime -mode of operation where it posts callbacks, which in turn means that -any call to <tt>synchronize_rcu()</tt> will block until the corresponding -callback is invoked. -Unfortunately, the callback cannot be invoked until RCU's runtime -grace-period machinery is up and running, which cannot happen until -the scheduler has initialized itself sufficiently to allow RCU's -kthreads to be spawned. -Therefore, invoking <tt>synchronize_rcu()</tt> during scheduler -initialization can result in deadlock. - -<p>@@QQ@@ -So what happens with <tt>synchronize_rcu()</tt> during -scheduler initialization for <tt>CONFIG_PREEMPT=n</tt> -kernels? -<p>@@QQA@@ -In <tt>CONFIG_PREEMPT=n</tt> kernels, <tt>synchronize_rcu()</tt> -maps directly to <tt>synchronize_sched()</tt>. -Therefore, <tt>synchronize_rcu()</tt> works normally throughout -boot in <tt>CONFIG_PREEMPT=n</tt> kernels. -However, your code must also work in <tt>CONFIG_PREEMPT=y</tt> kernels, -so it is still necessary to avoid invoking <tt>synchronize_rcu()</tt> -during scheduler initialization. -<p>@@QQE@@ - -<p> -I learned of these boot-time requirements as a result of a series of -system hangs. - -<h3><a name="Interrupts and NMIs">Interrupts and NMIs</a></h3> - -<p> -The Linux kernel has interrupts, and RCU read-side critical sections are -legal within interrupt handlers and within interrupt-disabled regions -of code, as are invocations of <tt>call_rcu()</tt>. - -<p> -Some Linux-kernel architectures can enter an interrupt handler from -non-idle process context, and then just never leave it, instead stealthily -transitioning back to process context. -This trick is sometimes used to invoke system calls from inside the kernel. -These “half-interrupts” mean that RCU has to be very careful -about how it counts interrupt nesting levels. -I learned of this requirement the hard way during a rewrite -of RCU's dyntick-idle code. - -<p> -The Linux kernel has non-maskable interrupts (NMIs), and -RCU read-side critical sections are legal within NMI handlers. -Thankfully, RCU update-side primitives, including -<tt>call_rcu()</tt>, are prohibited within NMI handlers. - -<p> -The name notwithstanding, some Linux-kernel architectures -can have nested NMIs, which RCU must handle correctly. -Andy Lutomirski -<a href="https://lkml.kernel.org/g/CALCETrXLq1y7e_dKFPgou-FKHB6Pu-r8+t-6Ds+8=va7anBWDA@mail.gmail.com">surprised me</a> -with this requirement; -he also kindly surprised me with -<a href="https://lkml.kernel.org/g/CALCETrXSY9JpW3uE6H8WYk81sg56qasA2aqmjMPsq5dOtzso=g@mail.gmail.com">an algorithm</a> -that meets this requirement.
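-
-<p>
-As an illustration of these rules, the following sketch shows an RCU
-reader in a (non-NMI) interrupt handler.
-The handler itself, its registration, and the <tt>handle_event()</tt>
-helper are hypothetical; the <tt>gp</tt> pointer is the one used in the
-earlier examples:
-
-<blockquote>
-<pre>
- 1 static irqreturn_t my_irq_handler(int irq, void *dev_id)
- 2 {
- 3   struct foo *p;
- 4
- 5   rcu_read_lock(); /* Legal in interrupt context. */
- 6   p = rcu_dereference(gp);
- 7   if (p)
- 8     handle_event(p->a); /* Hypothetical helper. */
- 9   rcu_read_unlock();
-10   return IRQ_HANDLED;
-11 }
-</pre>
-</blockquote>
-
-<p>
-An invocation of <tt>call_rcu()</tt> would also be legal in this handler,
-but, as noted above, not in an NMI handler, within which RCU's update-side
-primitives are prohibited.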
- -<h3><a name="Loadable Modules">Loadable Modules</a></h3> - -<p> -The Linux kernel has loadable modules, and these modules can -also be unloaded. -After a given module has been unloaded, any attempt to call -one of its functions results in a segmentation fault. -The module-unload functions must therefore cancel any -delayed calls to loadable-module functions, for example, -any outstanding <tt>mod_timer()</tt> must be dealt with -via <tt>del_timer_sync()</tt> or similar. - -<p> -Unfortunately, there is no way to cancel an RCU callback; -once you invoke <tt>call_rcu()</tt>, the callback function is -going to eventually be invoked, unless the system goes down first. -Because it is normally considered socially irresponsible to crash the system -in response to a module unload request, we need some other way -to deal with in-flight RCU callbacks. - -<p> -RCU therefore provides -<tt><a href="https://lwn.net/Articles/217484/">rcu_barrier()</a></tt>, -which waits until all in-flight RCU callbacks have been invoked. -If a module uses <tt>call_rcu()</tt>, its exit function should therefore -prevent any future invocation of <tt>call_rcu()</tt>, then invoke -<tt>rcu_barrier()</tt>. -In theory, the underlying module-unload code could invoke -<tt>rcu_barrier()</tt> unconditionally, but in practice this would -incur unacceptable latencies. - -<p> -Nikita Danilov noted this requirement for an analogous filesystem-unmount -situation, and Dipankar Sarma incorporated <tt>rcu_barrier()</tt> into RCU. -The need for <tt>rcu_barrier()</tt> for module unloading became -apparent later. - -<h3><a name="Hotplug CPU">Hotplug CPU</a></h3> - -<p> -The Linux kernel supports CPU hotplug, which means that CPUs -can come and go. -It is of course illegal to use any RCU API member from an offline CPU. -This requirement was present from day one in DYNIX/ptx, but -on the other hand, the Linux kernel's CPU-hotplug implementation -is “interesting.” - -<p> -The Linux-kernel CPU-hotplug implementation has notifiers that -are used to allow the various kernel subsystems (including RCU) -to respond appropriately to a given CPU-hotplug operation. -Most RCU operations may be invoked from CPU-hotplug notifiers, -including even normal synchronous grace-period operations -such as <tt>synchronize_rcu()</tt>. -However, expedited grace-period operations such as -<tt>synchronize_rcu_expedited()</tt> are not supported, -due to the fact that current implementations block CPU-hotplug -operations, which could result in deadlock. - -<p> -In addition, all-callback-wait operations such as -<tt>rcu_barrier()</tt> are also not supported, due to the -fact that there are phases of CPU-hotplug operations where -the outgoing CPU's callbacks will not be invoked until after -the CPU-hotplug operation ends, which could also result in deadlock. - -<h3><a name="Scheduler and RCU">Scheduler and RCU</a></h3> - -<p> -RCU depends on the scheduler, and the scheduler uses RCU to -protect some of its data structures. -This means the scheduler is forbidden from acquiring -the runqueue locks and the priority-inheritance locks -in the middle of an outermost RCU read-side critical section unless either -(1) it releases them before exiting that same -RCU read-side critical section, or -(2) interrupts are disabled across -that entire RCU read-side critical section. -This same prohibition also applies (recursively!) to any lock that is acquired -while holding any lock to which this prohibition applies. 
-Adhering to this rule prevents preemptible RCU from invoking
-<tt>rcu_read_unlock_special()</tt> while either runqueue or
-priority-inheritance locks are held, thus avoiding deadlock.
-
-<p>
-Prior to v4.4, it was only necessary to disable preemption across
-RCU read-side critical sections that acquired scheduler locks.
-In v4.4, expedited grace periods started using IPIs, and these
-IPIs could force a <tt>rcu_read_unlock()</tt> to take the slowpath.
-Therefore, this expedited-grace-period change required disabling of
-interrupts, not just preemption.
-
-<p>
-For RCU's part, the preemptible-RCU <tt>rcu_read_unlock()</tt>
-implementation must be written carefully to avoid similar deadlocks.
-In particular, <tt>rcu_read_unlock()</tt> must tolerate an
-interrupt where the interrupt handler invokes both
-<tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>.
-This possibility requires <tt>rcu_read_unlock()</tt> to use
-negative nesting levels to avoid destructive recursion via
-interrupt handler's use of RCU.
-
-<p>
-This pair of mutual scheduler-RCU requirements came as a
-<a href="https://lwn.net/Articles/453002/">complete surprise</a>.
-
-<p>
-As noted above, RCU makes use of kthreads, and it is necessary to
-avoid excessive CPU-time accumulation by these kthreads.
-This requirement was no surprise, but RCU's violation of it
-when running context-switch-heavy workloads when built with
-<tt>CONFIG_NO_HZ_FULL=y</tt>
-<a href="http://www.rdrop.com/users/paulmck/scalability/paper/BareMetal.2015.01.15b.pdf">did come as a surprise [PDF]</a>.
-RCU has made good progress towards meeting this requirement, even
-for context-switch-heavy <tt>CONFIG_NO_HZ_FULL=y</tt> workloads,
-but there is room for further improvement.
-
-<h3><a name="Tracing and RCU">Tracing and RCU</a></h3>
-
-<p>
-It is possible to use tracing on RCU code, but tracing itself
-uses RCU.
-For this reason, <tt>rcu_dereference_raw_notrace()</tt>
-is provided for use by tracing, which avoids the destructive
-recursion that could otherwise ensue.
-This API is also used by virtualization in some architectures,
-where RCU readers execute in environments in which tracing
-cannot be used.
-The tracing folks both located the requirement and provided the
-needed fix, so this surprise requirement was relatively painless.
-
-<h3><a name="Energy Efficiency">Energy Efficiency</a></h3>
-
-<p>
-Interrupting idle CPUs is considered socially unacceptable,
-especially by people with battery-powered embedded systems.
-RCU therefore conserves energy by detecting which CPUs are
-idle, including tracking CPUs that have been interrupted from idle.
-This is a large part of the energy-efficiency requirement,
-which I learned of via an irate phone call.
-
-<p>
-Because RCU avoids interrupting idle CPUs, it is illegal to
-execute an RCU read-side critical section on an idle CPU.
-(Kernels built with <tt>CONFIG_PROVE_RCU=y</tt> will splat
-if you try it.)
-The <tt>RCU_NONIDLE()</tt> macro and <tt>_rcuidle</tt>
-event tracing are provided to work around this restriction.
-In addition, <tt>rcu_is_watching()</tt> may be used to
-test whether or not it is currently legal to run RCU read-side
-critical sections on this CPU.
-I learned of the need for diagnostics on the one hand
-and <tt>RCU_NONIDLE()</tt> on the other while inspecting
-idle-loop code.
-Steven Rostedt supplied <tt>_rcuidle</tt> event tracing,
-which is used quite heavily in the idle loop.
-
-<p>
-It is similarly socially unacceptable to interrupt an
-<tt>nohz_full</tt> CPU running in userspace.
-RCU must therefore track <tt>nohz_full</tt> userspace -execution. -And in -<a href="https://lwn.net/Articles/558284/"><tt>CONFIG_NO_HZ_FULL_SYSIDLE=y</tt></a> -kernels, RCU must separately track idle CPUs on the one hand and -CPUs that are either idle or executing in userspace on the other. -In both cases, RCU must be able to sample state at two points in -time, and be able to determine whether or not some other CPU spent -any time idle and/or executing in userspace. - -<p> -These energy-efficiency requirements have proven quite difficult to -understand and to meet, for example, there have been more than five -clean-sheet rewrites of RCU's energy-efficiency code, the last of -which was finally able to demonstrate -<a href="http://www.rdrop.com/users/paulmck/realtime/paper/AMPenergy.2013.04.19a.pdf">real energy savings running on real hardware [PDF]</a>. -As noted earlier, -I learned of many of these requirements via angry phone calls: -Flaming me on the Linux-kernel mailing list was apparently not -sufficient to fully vent their ire at RCU's energy-efficiency bugs! - -<h3><a name="Memory Efficiency">Memory Efficiency</a></h3> - -<p> -Although small-memory non-realtime systems can simply use Tiny RCU, -code size is only one aspect of memory efficiency. -Another aspect is the size of the <tt>rcu_head</tt> structure -used by <tt>call_rcu()</tt> and <tt>kfree_rcu()</tt>. -Although this structure contains nothing more than a pair of pointers, -it does appear in many RCU-protected data structures, including -some that are size critical. -The <tt>page</tt> structure is a case in point, as evidenced by -the many occurrences of the <tt>union</tt> keyword within that structure. - -<p> -This need for memory efficiency is one reason that RCU uses hand-crafted -singly linked lists to track the <tt>rcu_head</tt> structures that -are waiting for a grace period to elapse. -It is also the reason why <tt>rcu_head</tt> structures do not contain -debug information, such as fields tracking the file and line of the -<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> that posted them. -Although this information might appear in debug-only kernel builds at some -point, in the meantime, the <tt>->func</tt> field will often provide -the needed debug information. - -<p> -However, in some cases, the need for memory efficiency leads to even -more extreme measures. -Returning to the <tt>page</tt> structure, the <tt>rcu_head</tt> field -shares storage with a great many other structures that are used at -various points in the corresponding page's lifetime. -In order to correctly resolve certain -<a href="https://lkml.kernel.org/g/1439976106-137226-1-git-send-email-kirill.shutemov@linux.intel.com">race conditions</a>, -the Linux kernel's memory-management subsystem needs a particular bit -to remain zero during all phases of grace-period processing, -and that bit happens to map to the bottom bit of the -<tt>rcu_head</tt> structure's <tt>->next</tt> field. -RCU makes this guarantee as long as <tt>call_rcu()</tt> -is used to post the callback, as opposed to <tt>kfree_rcu()</tt> -or some future “lazy” -variant of <tt>call_rcu()</tt> that might one day be created for -energy-efficiency purposes. 
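<p>
As a purely illustrative sketch (the <tt>foo</tt> structure and the function
names below are hypothetical, not taken from the kernel), the two-pointer
<tt>rcu_head</tt> is typically embedded directly in the protected structure
and passed to <tt>call_rcu()</tt> along with a callback that recovers the
enclosing structure via <tt>container_of()</tt>:

<blockquote>
<pre>
struct foo {
	struct list_head list;
	int key;
	struct rcu_head rcu;		/* Only two pointers of overhead. */
};

static void foo_reclaim(struct rcu_head *rhp)
{
	struct foo *fp = container_of(rhp, struct foo, rcu);

	kfree(fp);
}

static void foo_remove(struct foo *fp)
{
	list_del_rcu(&fp->list);
	call_rcu(&fp->rcu, foo_reclaim);	/* ->func identifies the callback. */
}
</pre>
</blockquote>

<p>
Because the only per-callback state is the <tt>rcu_head</tt> itself, this
pattern adds no storage beyond those two pointers, and when the callback does
nothing but <tt>kfree()</tt> the structure, <tt>kfree_rcu(fp, rcu)</tt> can be
used in place of the last two functions.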
- -<h3><a name="Performance, Scalability, Response Time, and Reliability"> -Performance, Scalability, Response Time, and Reliability</a></h3> - -<p> -Expanding on the -<a href="#Performance and Scalability">earlier discussion</a>, -RCU is used heavily by hot code paths in performance-critical -portions of the Linux kernel's networking, security, virtualization, -and scheduling code paths. -RCU must therefore use efficient implementations, especially in its -read-side primitives. -To that end, it would be good if preemptible RCU's implementation -of <tt>rcu_read_lock()</tt> could be inlined, however, doing -this requires resolving <tt>#include</tt> issues with the -<tt>task_struct</tt> structure. - -<p> -The Linux kernel supports hardware configurations with up to -4096 CPUs, which means that RCU must be extremely scalable. -Algorithms that involve frequent acquisitions of global locks or -frequent atomic operations on global variables simply cannot be -tolerated within the RCU implementation. -RCU therefore makes heavy use of a combining tree based on the -<tt>rcu_node</tt> structure. -RCU is required to tolerate all CPUs continuously invoking any -combination of RCU's runtime primitives with minimal per-operation -overhead. -In fact, in many cases, increasing load must <i>decrease</i> the -per-operation overhead, witness the batching optimizations for -<tt>synchronize_rcu()</tt>, <tt>call_rcu()</tt>, -<tt>synchronize_rcu_expedited()</tt>, and <tt>rcu_barrier()</tt>. -As a general rule, RCU must cheerfully accept whatever the -rest of the Linux kernel decides to throw at it. - -<p> -The Linux kernel is used for real-time workloads, especially -in conjunction with the -<a href="https://rt.wiki.kernel.org/index.php/Main_Page">-rt patchset</a>. -The real-time-latency response requirements are such that the -traditional approach of disabling preemption across RCU -read-side critical sections is inappropriate. -Kernels built with <tt>CONFIG_PREEMPT=y</tt> therefore -use an RCU implementation that allows RCU read-side critical -sections to be preempted. -This requirement made its presence known after users made it -clear that an earlier -<a href="https://lwn.net/Articles/107930/">real-time patch</a> -did not meet their needs, in conjunction with some -<a href="https://lkml.kernel.org/g/20050318002026.GA2693@us.ibm.com">RCU issues</a> -encountered by a very early version of the -rt patchset. - -<p> -In addition, RCU must make do with a sub-100-microsecond real-time latency -budget. -In fact, on smaller systems with the -rt patchset, the Linux kernel -provides sub-20-microsecond real-time latencies for the whole kernel, -including RCU. -RCU's scalability and latency must therefore be sufficient for -these sorts of configurations. -To my surprise, the sub-100-microsecond real-time latency budget -<a href="http://www.rdrop.com/users/paulmck/realtime/paper/bigrt.2013.01.31a.LCA.pdf"> -applies to even the largest systems [PDF]</a>, -up to and including systems with 4096 CPUs. -This real-time requirement motivated the grace-period kthread, which -also simplified handling of a number of race conditions. - -<p> -Finally, RCU's status as a synchronization primitive means that -any RCU failure can result in arbitrary memory corruption that can be -extremely difficult to debug. -This means that RCU must be extremely reliable, which in -practice also means that RCU must have an aggressive stress-test -suite. -This stress-test suite is called <tt>rcutorture</tt>. 
-
-<p>
-Although the need for <tt>rcutorture</tt> was no surprise,
-the current immense popularity of the Linux kernel is posing
-interesting—and perhaps unprecedented—validation
-challenges.
-To see this, keep in mind that there are well over one billion
-instances of the Linux kernel running today, given Android
-smartphones, Linux-powered televisions, and servers.
-This number can be expected to increase sharply with the advent of
-the celebrated Internet of Things.
-
-<p>
-Suppose that RCU contains a race condition that manifests on average
-once per million years of runtime.
-This bug will be occurring about three times per <i>day</i> across
-the installed base.
-RCU could simply hide behind hardware error rates, given that no one
-should really expect their smartphone to last for a million years.
-However, anyone taking too much comfort from this thought should
-consider the fact that in most jurisdictions, a successful multi-year
-test of a given mechanism, which might include a Linux kernel,
-suffices for a number of types of safety-critical certifications.
-In fact, rumor has it that the Linux kernel is already being used
-in production for safety-critical applications.
-I don't know about you, but I would feel quite bad if a bug in RCU
-killed someone.
-Which might explain my recent focus on validation and verification.
-
-<h2><a name="Other RCU Flavors">Other RCU Flavors</a></h2>
-
-<p>
-One of the more surprising things about RCU is that there are now
-no fewer than five <i>flavors</i>, or API families.
-In addition, the primary flavor that has been the sole focus up to
-this point has two different implementations, non-preemptible and
-preemptible.
-The other four flavors are listed below, with requirements for each
-described in a separate section.
-
-<ol>
-<li> <a href="#Bottom-Half Flavor">Bottom-Half Flavor</a>
-<li> <a href="#Sched Flavor">Sched Flavor</a>
-<li> <a href="#Sleepable RCU">Sleepable RCU</a>
-<li> <a href="#Tasks RCU">Tasks RCU</a>
-</ol>
-
-<h3><a name="Bottom-Half Flavor">Bottom-Half Flavor</a></h3>
-
-<p>
-The softirq-disable (AKA “bottom-half”,
-hence the “_bh” abbreviations)
-flavor of RCU, or <i>RCU-bh</i>, was developed by
-Dipankar Sarma to provide a flavor of RCU that could withstand the
-network-based denial-of-service attacks researched by Robert
-Olsson.
-These attacks placed so much networking load on the system
-that some of the CPUs never exited softirq execution,
-which in turn prevented those CPUs from ever executing a context switch,
-which, in the RCU implementation of that time, prevented grace periods
-from ever ending.
-The result was an out-of-memory condition and a system hang.
-
-<p>
-The solution was the creation of RCU-bh, which does
-<tt>local_bh_disable()</tt>
-across its read-side critical sections, and which uses the transition
-from one type of softirq processing to another as a quiescent state
-in addition to context switch, idle, user mode, and offline.
-This means that RCU-bh grace periods can complete even when some of
-the CPUs execute in softirq indefinitely, thus allowing algorithms
-based on RCU-bh to withstand network-based denial-of-service attacks.
-
-<p>
-Because
-<tt>rcu_read_lock_bh()</tt> and <tt>rcu_read_unlock_bh()</tt>
-disable and re-enable softirq handlers, any attempt to start a softirq
-handler during the
-RCU-bh read-side critical section will be deferred.
-In this case, <tt>rcu_read_unlock_bh()</tt>
-will invoke softirq processing, which can take considerable time.
-One can of course argue that this softirq overhead should be associated
-with the code following the RCU-bh read-side critical section rather
-than <tt>rcu_read_unlock_bh()</tt>, but the fact
-is that most profiling tools cannot be expected to make this sort
-of fine distinction.
-For example, suppose that a three-millisecond-long RCU-bh read-side
-critical section executes during a time of heavy networking load.
-There will very likely be an attempt to invoke at least one softirq
-handler during that three milliseconds, but any such invocation will
-be delayed until the time of the <tt>rcu_read_unlock_bh()</tt>.
-This can of course make it appear at first glance as if
-<tt>rcu_read_unlock_bh()</tt> was executing very slowly.
-
-<p>
-The
-<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-bh API</a>
-includes
-<tt>rcu_read_lock_bh()</tt>,
-<tt>rcu_read_unlock_bh()</tt>,
-<tt>rcu_dereference_bh()</tt>,
-<tt>rcu_dereference_bh_check()</tt>,
-<tt>synchronize_rcu_bh()</tt>,
-<tt>synchronize_rcu_bh_expedited()</tt>,
-<tt>call_rcu_bh()</tt>,
-<tt>rcu_barrier_bh()</tt>, and
-<tt>rcu_read_lock_bh_held()</tt>.
-
-<h3><a name="Sched Flavor">Sched Flavor</a></h3>
-
-<p>
-Before preemptible RCU, waiting for an RCU grace period had the
-side effect of also waiting for all pre-existing interrupt
-and NMI handlers.
-However, there are legitimate preemptible-RCU implementations that
-do not have this property, given that any point in the code outside
-of an RCU read-side critical section can be a quiescent state.
-Therefore, <i>RCU-sched</i> was created, which follows “classic”
-RCU in that an RCU-sched grace period waits for pre-existing
-interrupt and NMI handlers.
-In kernels built with <tt>CONFIG_PREEMPT=n</tt>, the RCU and RCU-sched
-APIs have identical implementations, while kernels built with
-<tt>CONFIG_PREEMPT=y</tt> provide a separate implementation for each.
-
-<p>
-Note well that in <tt>CONFIG_PREEMPT=y</tt> kernels,
-<tt>rcu_read_lock_sched()</tt> and <tt>rcu_read_unlock_sched()</tt>
-disable and re-enable preemption, respectively.
-This means that if there was a preemption attempt during the
-RCU-sched read-side critical section, <tt>rcu_read_unlock_sched()</tt>
-will enter the scheduler, with all the latency and overhead entailed.
-Just as with <tt>rcu_read_unlock_bh()</tt>, this can make it look
-as if <tt>rcu_read_unlock_sched()</tt> was executing very slowly.
-However, the highest-priority task won't be preempted, so that task
-will enjoy low-overhead <tt>rcu_read_unlock_sched()</tt> invocations.
-
-<p>
-The
-<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-sched API</a>
-includes
-<tt>rcu_read_lock_sched()</tt>,
-<tt>rcu_read_unlock_sched()</tt>,
-<tt>rcu_read_lock_sched_notrace()</tt>,
-<tt>rcu_read_unlock_sched_notrace()</tt>,
-<tt>rcu_dereference_sched()</tt>,
-<tt>rcu_dereference_sched_check()</tt>,
-<tt>synchronize_sched()</tt>,
-<tt>synchronize_sched_expedited()</tt>,
-<tt>call_rcu_sched()</tt>,
-<tt>rcu_barrier_sched()</tt>, and
-<tt>rcu_read_lock_sched_held()</tt>.
-However, anything that disables preemption also marks an RCU-sched
-read-side critical section, including
-<tt>preempt_disable()</tt> and <tt>preempt_enable()</tt>,
-<tt>local_irq_save()</tt> and <tt>local_irq_restore()</tt>,
-and so on.
-
-<h3><a name="Sleepable RCU">Sleepable RCU</a></h3>
-
-<p>
-For well over a decade, someone saying “I need to block within
-an RCU read-side critical section” was a reliable indication
-that this someone did not understand RCU.
-After all, if you are always blocking in an RCU read-side critical
-section, you can probably afford to use a higher-overhead synchronization
-mechanism.
-However, that changed with the advent of the Linux kernel's notifiers,
-whose RCU read-side critical
-sections almost never sleep, but sometimes need to.
-This resulted in the introduction of
-<a href="https://lwn.net/Articles/202847/">sleepable RCU</a>,
-or <i>SRCU</i>.
-
-<p>
-SRCU allows different domains to be defined, with each such domain
-defined by an instance of an <tt>srcu_struct</tt> structure.
-A pointer to this structure must be passed in to each SRCU function,
-for example, <tt>synchronize_srcu(&ss)</tt>, where
-<tt>ss</tt> is the <tt>srcu_struct</tt> structure.
-The key benefit of these domains is that a slow SRCU reader in one
-domain does not delay an SRCU grace period in some other domain.
-That said, one consequence of these domains is that read-side code
-must pass a “cookie” from <tt>srcu_read_lock()</tt>
-to <tt>srcu_read_unlock()</tt>, for example, as follows:
-
-<blockquote>
-<pre>
- 1 int idx;
- 2
- 3 idx = srcu_read_lock(&ss);
- 4 do_something();
- 5 srcu_read_unlock(&ss, idx);
-</pre>
-</blockquote>
-
-<p>
-As noted above, it is legal to block within SRCU read-side critical sections,
-however, with great power comes great responsibility.
-If you block forever in one of a given domain's SRCU read-side critical
-sections, then that domain's grace periods will also be blocked forever.
-Of course, one good way to block forever is to deadlock, which can
-happen if any operation in a given domain's SRCU read-side critical
-section can block waiting, either directly or indirectly, for that domain's
-grace period to elapse.
-For example, this results in a self-deadlock:
-
-<blockquote>
-<pre>
- 1 int idx;
- 2
- 3 idx = srcu_read_lock(&ss);
- 4 do_something();
- 5 synchronize_srcu(&ss);
- 6 srcu_read_unlock(&ss, idx);
-</pre>
-</blockquote>
-
-<p>
-However, if line 5 acquired a mutex that was held across
-a <tt>synchronize_srcu()</tt> for domain <tt>ss</tt>,
-deadlock would still be possible.
-Furthermore, if line 5 acquired a mutex that was held across
-a <tt>synchronize_srcu()</tt> for some other domain <tt>ss1</tt>,
-and if an <tt>ss1</tt>-domain SRCU read-side critical section
-acquired another mutex that was held across an <tt>ss</tt>-domain
-<tt>synchronize_srcu()</tt>,
-deadlock would again be possible.
-Such a deadlock cycle could extend across an arbitrarily large number
-of different SRCU domains.
-Again, with great power comes great responsibility.
-
-<p>
-Unlike the other RCU flavors, SRCU read-side critical sections can
-run on idle and even offline CPUs.
-This ability requires that <tt>srcu_read_lock()</tt> and
-<tt>srcu_read_unlock()</tt> contain memory barriers, which means
-that SRCU readers will run a bit slower than would RCU readers.
-It also motivates the <tt>smp_mb__after_srcu_read_unlock()</tt>
-API, which, in combination with <tt>srcu_read_unlock()</tt>,
-guarantees a full memory barrier.
-
-<p>
-The
-<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">SRCU API</a>
-includes
-<tt>srcu_read_lock()</tt>,
-<tt>srcu_read_unlock()</tt>,
-<tt>srcu_dereference()</tt>,
-<tt>srcu_dereference_check()</tt>,
-<tt>synchronize_srcu()</tt>,
-<tt>synchronize_srcu_expedited()</tt>,
-<tt>call_srcu()</tt>,
-<tt>srcu_barrier()</tt>, and
-<tt>srcu_read_lock_held()</tt>.
-It also includes
-<tt>DEFINE_SRCU()</tt>,
-<tt>DEFINE_STATIC_SRCU()</tt>, and
-<tt>init_srcu_struct()</tt>
-APIs for defining and initializing <tt>srcu_struct</tt> structures.
-
-<h3><a name="Tasks RCU">Tasks RCU</a></h3>
-
-<p>
-Some forms of tracing use “trampolines” to handle the
-binary rewriting required to install different types of probes.
-It would be good to be able to free old trampolines, which sounds
-like a job for some form of RCU.
-However, because it is necessary to be able to install a trace
-anywhere in the code, it is not possible to use read-side markers
-such as <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>.
-In addition, it does not work to have these markers in the trampoline
-itself, because there would need to be instructions following
-<tt>rcu_read_unlock()</tt>.
-Although <tt>synchronize_rcu()</tt> would guarantee that execution
-reached the <tt>rcu_read_unlock()</tt>, it would not be able to
-guarantee that execution had completely left the trampoline.
-
-<p>
-The solution, in the form of
-<a href="https://lwn.net/Articles/607117/"><i>Tasks RCU</i></a>,
-is to have implicit
-read-side critical sections that are delimited by voluntary context
-switches, that is, calls to <tt>schedule()</tt>,
-<tt>cond_resched_rcu_qs()</tt>, and
-<tt>synchronize_rcu_tasks()</tt>.
-In addition, transitions to and from userspace execution also delimit
-tasks-RCU read-side critical sections.
-
-<p>
-The tasks-RCU API is quite compact, consisting only of
-<tt>call_rcu_tasks()</tt>,
-<tt>synchronize_rcu_tasks()</tt>, and
-<tt>rcu_barrier_tasks()</tt>.
-
-<h2><a name="Possible Future Changes">Possible Future Changes</a></h2>
-
-<p>
-One of the tricks that RCU uses to attain update-side scalability is
-to increase grace-period latency with increasing numbers of CPUs.
-If this becomes a serious problem, it will be necessary to rework the
-grace-period state machine so as to avoid the need for the additional
-latency.
-
-<p>
-Expedited grace periods scan the CPUs, so their latency and overhead
-increase with increasing numbers of CPUs.
-If this becomes a serious problem on large systems, it will be necessary
-to do some redesign to avoid this scalability problem.
-
-<p>
-RCU disables CPU hotplug in a few places, perhaps most notably in the
-expedited grace-period and <tt>rcu_barrier()</tt> operations.
-If there is a strong reason to use expedited grace periods in CPU-hotplug
-notifiers, it will be necessary to avoid disabling CPU hotplug.
-This would introduce some complexity, so there had better be a <i>very</i>
-good reason.
-
-<p>
-The tradeoff between grace-period latency on the one hand and interruptions
-of other CPUs on the other hand may need to be re-examined.
-The desire is of course for zero grace-period latency as well as zero
-interprocessor interrupts undertaken during an expedited grace period
-operation.
-While this ideal is unlikely to be achievable, it is quite possible that
-further improvements can be made.
-
-<p>
-The multiprocessor implementations of RCU use a combining tree that
-groups CPUs so as to reduce lock contention and increase cache locality.
-However, this combining tree does not spread its memory across NUMA
-nodes nor does it align the CPU groups with hardware features such
-as sockets or cores.
-Such spreading and alignment is currently believed to be unnecessary
-because the hotpath read-side primitives do not access the combining
-tree, nor does <tt>call_rcu()</tt> in the common case.
-If you believe that your architecture needs such spreading and alignment, -then your architecture should also benefit from the -<tt>rcutree.rcu_fanout_leaf</tt> boot parameter, which can be set -to the number of CPUs in a socket, NUMA node, or whatever. -If the number of CPUs is too large, use a fraction of the number of -CPUs. -If the number of CPUs is a large prime number, well, that certainly -is an “interesting” architectural choice! -More flexible arrangements might be considered, but only if -<tt>rcutree.rcu_fanout_leaf</tt> has proven inadequate, and only -if the inadequacy has been demonstrated by a carefully run and -realistic system-level workload. - -<p> -Please note that arrangements that require RCU to remap CPU numbers will -require extremely good demonstration of need and full exploration of -alternatives. - -<p> -There is an embarrassingly large number of flavors of RCU, and this -number has been increasing over time. -Perhaps it will be possible to combine some at some future date. - -<p> -RCU's various kthreads are reasonably recent additions. -It is quite likely that adjustments will be required to more gracefully -handle extreme loads. -It might also be necessary to be able to relate CPU utilization by -RCU's kthreads and softirq handlers to the code that instigated this -CPU utilization. -For example, RCU callback overhead might be charged back to the -originating <tt>call_rcu()</tt> instance, though probably not -in production kernels. - -<h2><a name="Summary">Summary</a></h2> - -<p> -This document has presented more than two decade's worth of RCU -requirements. -Given that the requirements keep changing, this will not be the last -word on this subject, but at least it serves to get an important -subset of the requirements set forth. - -<h2><a name="Acknowledgments">Acknowledgments</a></h2> - -I am grateful to Steven Rostedt, Lai Jiangshan, Ingo Molnar, -Oleg Nesterov, Borislav Petkov, Peter Zijlstra, Boqun Feng, and -Andy Lutomirski for their help in rendering -this article human readable, and to Michelle Rankin for her support -of this effort. -Other contributions are acknowledged in the Linux kernel's git archive. -The cartoon is copyright (c) 2013 by Melissa Broussard, -and is provided -under the terms of the Creative Commons Attribution-Share Alike 3.0 -United States license. - -<p>@@QQAL@@ - -</body></html> diff --git a/Documentation/RCU/Design/htmlqqz.sh b/Documentation/RCU/Design/htmlqqz.sh deleted file mode 100755 index d354f069559b..000000000000 --- a/Documentation/RCU/Design/htmlqqz.sh +++ /dev/null @@ -1,108 +0,0 @@ -#!/bin/sh -# -# Usage: sh htmlqqz.sh file -# -# Extracts and converts quick quizzes in a proto-HTML document file.htmlx. -# Commands, all of which must be on a line by themselves: -# -# "<p>@@QQ@@": Start of a quick quiz. -# "<p>@@QQA@@": Start of a quick-quiz answer. -# "<p>@@QQE@@": End of a quick-quiz answer, and thus of the quick quiz. -# "<p>@@QQAL@@": Place to put quick-quiz answer list. -# -# Places the result in file.html. -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. 
-# -# You should have received a copy of the GNU General Public License -# along with this program; if not, you can access it online at -# http://www.gnu.org/licenses/gpl-2.0.html. -# -# Copyright (c) 2013 Paul E. McKenney, IBM Corporation. - -fn=$1 -if test ! -r $fn.htmlx -then - echo "Error: $fn.htmlx unreadable." - exit 1 -fi - -echo "<!-- DO NOT HAND EDIT. -->" > $fn.html -echo "<!-- Instead, edit $fn.htmlx and run 'sh htmlqqz.sh $fn' -->" >> $fn.html -awk < $fn.htmlx >> $fn.html ' - -state == "" && $1 != "<p>@@QQ@@" && $1 != "<p>@@QQAL@@" { - print $0; - if ($0 ~ /^<p>@@QQ/) - print "Bad Quick Quiz command: " NR " (expected <p>@@QQ@@ or <p>@@QQAL@@)." > "/dev/stderr" - next; -} - -state == "" && $1 == "<p>@@QQ@@" { - qqn++; - qqlineno = NR; - haveqq = 1; - state = "qq"; - print "<p><a name=\"Quick Quiz " qqn "\"><b>Quick Quiz " qqn "</b>:</a>" - next; -} - -state == "qq" && $1 != "<p>@@QQA@@" { - qq[qqn] = qq[qqn] $0 "\n"; - print $0 - if ($0 ~ /^<p>@@QQ/) - print "Bad Quick Quiz command: " NR ". (expected <p>@@QQA@@)" > "/dev/stderr" - next; -} - -state == "qq" && $1 == "<p>@@QQA@@" { - state = "qqa"; - print "<br><a href=\"#qq" qqn "answer\">Answer</a>" - next; -} - -state == "qqa" && $1 != "<p>@@QQE@@" { - qqa[qqn] = qqa[qqn] $0 "\n"; - if ($0 ~ /^<p>@@QQ/) - print "Bad Quick Quiz command: " NR " (expected <p>@@QQE@@)." > "/dev/stderr" - next; -} - -state == "qqa" && $1 == "<p>@@QQE@@" { - state = ""; - next; -} - -state == "" && $1 == "<p>@@QQAL@@" { - haveqq = ""; - print "<h3><a name=\"Answers to Quick Quizzes\">" - print "Answers to Quick Quizzes</a></h3>" - print ""; - for (i = 1; i <= qqn; i++) { - print "<a name=\"qq" i "answer\"></a>" - print "<p><b>Quick Quiz " i "</b>:" - print qq[i]; - print ""; - print "</p><p><b>Answer</b>:" - print qqa[i]; - print ""; - print "</p><p><a href=\"#Quick%20Quiz%20" i "\"><b>Back to Quick Quiz " i "</b>.</a>" - print ""; - } - next; -} - -END { - if (state != "") - print "Unterminated Quick Quiz: " qqlineno "." > "/dev/stderr" - else if (haveqq) - print "Missing \"<p>@@QQAL@@\", no Quick Quiz." > "/dev/stderr" -}' diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index ec6998b1b6d0..00a3a38b375a 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -237,17 +237,17 @@ o "ktl" is the low-order 16 bits (in hexadecimal) of the count of The output of "cat rcu/rcu_preempt/rcuexp" looks as follows: -s=21872 wd0=0 wd1=0 wd2=0 wd3=5 n=0 enq=0 sc=21872 +s=21872 wd1=0 wd2=0 wd3=5 n=0 enq=0 sc=21872 These fields are as follows: o "s" is the sequence number, with an odd number indicating that an expedited grace period is in progress. -o "wd0", "wd1", "wd2", and "wd3" are the number of times that an - attempt to start an expedited grace period found that someone - else had completed an expedited grace period that satisfies the - attempted request. "Our work is done." +o "wd1", "wd2", and "wd3" are the number of times that an attempt + to start an expedited grace period found that someone else had + completed an expedited grace period that satisfies the attempted + request. "Our work is done." o "n" is number of times that a concurrent CPU-hotplug operation forced a fallback to a normal grace period. 
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index dc49c6712b17..111770ffa10e 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -681,22 +681,30 @@ Although RCU can be used in many different ways, a very common use of RCU is analogous to reader-writer locking. The following unified diff shows how closely related RCU and reader-writer locking can be. + @@ -5,5 +5,5 @@ struct el { + int data; + /* Other data fields */ + }; + -rwlock_t listmutex; + +spinlock_t listmutex; + struct el head; + @@ -13,15 +14,15 @@ struct list_head *lp; struct el *p; - - read_lock(); + - read_lock(&listmutex); - list_for_each_entry(p, head, lp) { + rcu_read_lock(); + list_for_each_entry_rcu(p, head, lp) { if (p->key == key) { *result = p->data; - - read_unlock(); + - read_unlock(&listmutex); + rcu_read_unlock(); return 1; } } - - read_unlock(); + - read_unlock(&listmutex); + rcu_read_unlock(); return 0; } @@ -732,7 +740,7 @@ Or, for those who prefer a side-by-side listing: 5 int data; 5 int data; 6 /* Other data fields */ 6 /* Other data fields */ 7 }; 7 }; - 8 spinlock_t listmutex; 8 spinlock_t listmutex; + 8 rwlock_t listmutex; 8 spinlock_t listmutex; 9 struct el head; 9 struct el head; 1 int search(long key, int *result) 1 int search(long key, int *result) @@ -740,15 +748,15 @@ Or, for those who prefer a side-by-side listing: 3 struct list_head *lp; 3 struct list_head *lp; 4 struct el *p; 4 struct el *p; 5 5 - 6 read_lock(); 6 rcu_read_lock(); + 6 read_lock(&listmutex); 6 rcu_read_lock(); 7 list_for_each_entry(p, head, lp) { 7 list_for_each_entry_rcu(p, head, lp) { 8 if (p->key == key) { 8 if (p->key == key) { 9 *result = p->data; 9 *result = p->data; -10 read_unlock(); 10 rcu_read_unlock(); +10 read_unlock(&listmutex); 10 rcu_read_unlock(); 11 return 1; 11 return 1; 12 } 12 } 13 } 13 } -14 read_unlock(); 14 rcu_read_unlock(); +14 read_unlock(&listmutex); 14 rcu_read_unlock(); 15 return 0; 15 return 0; 16 } 16 } diff --git a/Documentation/devicetree/bindings/regmap/regmap.txt b/Documentation/devicetree/bindings/regmap/regmap.txt index e98a9652ccc8..0127be360fe8 100644 --- a/Documentation/devicetree/bindings/regmap/regmap.txt +++ b/Documentation/devicetree/bindings/regmap/regmap.txt @@ -1,50 +1,29 @@ -Device-Tree binding for regmap - -The endianness mode of CPU & Device scenarios: -Index Device Endianness properties ---------------------------------------------------- -1 BE 'big-endian' -2 LE 'little-endian' -3 Native 'native-endian' - -For one device driver, which will run in different scenarios above -on different SoCs using the devicetree, we need one way to simplify -this. +Devicetree binding for regmap Optional properties: -- {big,little,native}-endian: these are boolean properties, if absent - then the implementation will choose a default based on the device - being controlled. These properties are for register values and all - the buffers only. Native endian means that the CPU and device have - the same endianness. -Examples: -Scenario 1 : CPU in LE mode & device in LE mode. -dev: dev@40031000 { - compatible = "name"; - reg = <0x40031000 0x1000>; - ... -}; + little-endian, + big-endian, + native-endian: See common-properties.txt for a definition -Scenario 2 : CPU in LE mode & device in BE mode. -dev: dev@40031000 { - compatible = "name"; - reg = <0x40031000 0x1000>; - ... - big-endian; -}; +Note: +Regmap defaults to little-endian register access on MMIO based +devices, this is by far the most common setting. 
On CPU +architectures that typically run big-endian operating systems +(e.g. PowerPC), registers can be defined as big-endian and must +be marked that way in the devicetree. -Scenario 3 : CPU in BE mode & device in BE mode. -dev: dev@40031000 { - compatible = "name"; - reg = <0x40031000 0x1000>; - ... -}; +On SoCs that can be operated in both big-endian and little-endian +modes, with a single hardware switch controlling both the endianess +of the CPU and a byteswap for MMIO registers (e.g. many Broadcom MIPS +chips), "native-endian" is used to allow using the same device tree +blob in both cases. -Scenario 4 : CPU in BE mode & device in LE mode. +Examples: +Scenario 1 : a register set in big-endian mode. dev: dev@40031000 { - compatible = "name"; + compatible = "syscon"; reg = <0x40031000 0x1000>; + big-endian; ... - little-endian; }; diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 0b3de80ec8f6..13f89d18bc25 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -131,6 +131,7 @@ parameter is applicable: More X86-64 boot options can be found in Documentation/x86/x86_64/boot-options.txt . X86 Either 32-bit or 64-bit x86 (same as X86-32+X86-64) + X86_UV SGI UV support is enabled. XEN Xen support is enabled In addition, the following text indicates that the option: @@ -542,6 +543,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Format: <int> (must be >=0) Default: 64 + bau= [X86_UV] Enable the BAU on SGI UV. The default + behavior is to disable the BAU (i.e. bau=0). + Format: { "0" | "1" } + 0 - Disable the BAU. + 1 - Enable the BAU. + unset - Disable the BAU. + baycom_epp= [HW,AX25] Format: <io>,<mode> @@ -3284,6 +3292,44 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Lazy RCU callbacks are those which RCU can prove do nothing more than free memory. + rcuperf.gp_exp= [KNL] + Measure performance of expedited synchronous + grace-period primitives. + + rcuperf.holdoff= [KNL] + Set test-start holdoff period. The purpose of + this parameter is to delay the start of the + test until boot completes in order to avoid + interference. + + rcuperf.nreaders= [KNL] + Set number of RCU readers. The value -1 selects + N, where N is the number of CPUs. A value + "n" less than -1 selects N-n+1, where N is again + the number of CPUs. For example, -2 selects N + (the number of CPUs), -3 selects N+1, and so on. + A value of "n" less than or equal to -N selects + a single reader. + + rcuperf.nwriters= [KNL] + Set number of RCU writers. The values operate + the same as for rcuperf.nreaders. + N, where N is the number of CPUs + + rcuperf.perf_runnable= [BOOT] + Start rcuperf running at boot time. + + rcuperf.shutdown= [KNL] + Shut the system down after performance tests + complete. This is useful for hands-off automated + testing. + + rcuperf.perf_type= [KNL] + Specify the RCU implementation to test. + + rcuperf.verbose= [KNL] + Enable additional printk() statements. + rcutorture.cbflood_inter_holdoff= [KNL] Set holdoff time (jiffies) between successive callback-flood tests. 
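As an illustrative example (not part of the patch above), and assuming an
rcuperf-enabled test kernel, the new rcuperf parameters might be combined on
the boot command line roughly as follows; the specific values are arbitrary,
and "rcu" is assumed to name the default implementation under test:

	rcuperf.perf_type=rcu rcuperf.nreaders=-1 rcuperf.nwriters=-1 rcuperf.holdoff=10 rcuperf.shutdown=1 rcuperf.verbose=1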
diff --git a/Documentation/locking/lockdep-design.txt b/Documentation/locking/lockdep-design.txt index 5001280e9d82..9de1c158d44c 100644 --- a/Documentation/locking/lockdep-design.txt +++ b/Documentation/locking/lockdep-design.txt @@ -97,7 +97,7 @@ between any two lock-classes: <hardirq-safe> -> <hardirq-unsafe> <softirq-safe> -> <softirq-unsafe> -The first rule comes from the fact the a hardirq-safe lock could be +The first rule comes from the fact that a hardirq-safe lock could be taken by a hardirq context, interrupting a hardirq-unsafe lock - and thus could result in a lock inversion deadlock. Likewise, a softirq-safe lock could be taken by an softirq context, interrupting a softirq-unsafe @@ -220,7 +220,7 @@ calculated, which hash is unique for every lock chain. The hash value, when the chain is validated for the first time, is then put into a hash table, which hash-table can be checked in a lockfree manner. If the locking chain occurs again later on, the hash table tells us that we -dont have to validate the chain again. +don't have to validate the chain again. Troubleshooting: ---------------- diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index 3729cbe60e41..147ae8ec836f 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -4,8 +4,40 @@ By: David Howells <dhowells@redhat.com> Paul E. McKenney <paulmck@linux.vnet.ibm.com> + Will Deacon <will.deacon@arm.com> + Peter Zijlstra <peterz@infradead.org> -Contents: +========== +DISCLAIMER +========== + +This document is not a specification; it is intentionally (for the sake of +brevity) and unintentionally (due to being human) incomplete. This document is +meant as a guide to using the various memory barriers provided by Linux, but +in case of any doubt (and there are many) please ask. + +To repeat, this document is not a specification of what Linux expects from +hardware. + +The purpose of this document is twofold: + + (1) to specify the minimum functionality that one can rely on for any + particular barrier, and + + (2) to provide a guide as to how to use the barriers that are available. + +Note that an architecture can provide more than the minimum requirement +for any particular barrier, but if the architecure provides less than +that, that architecture is incorrect. + +Note also that it is possible that a barrier may be a no-op for an +architecture because the way that arch works renders an explicit barrier +unnecessary in that case. + + +======== +CONTENTS +======== (*) Abstract memory access model. @@ -31,15 +63,15 @@ Contents: (*) Implicit kernel memory barriers. - - Locking functions. + - Lock acquisition functions. - Interrupt disabling functions. - Sleep and wake-up functions. - Miscellaneous functions. - (*) Inter-CPU locking barrier effects. + (*) Inter-CPU acquiring barrier effects. - - Locks vs memory accesses. - - Locks vs I/O accesses. + - Acquires vs memory accesses. + - Acquires vs I/O accesses. (*) Where are memory barriers needed? @@ -61,6 +93,7 @@ Contents: (*) The things CPUs get up to. - And then there's the Alpha. + - Virtual Machine Guests. (*) Example uses. @@ -148,7 +181,7 @@ As a further example, consider this sequence of events: CPU 1 CPU 2 =============== =============== - { A == 1, B == 2, C = 3, P == &A, Q == &C } + { A == 1, B == 2, C == 3, P == &A, Q == &C } B = 4; Q = P; P = &B D = *Q; @@ -430,8 +463,9 @@ And a couple of implicit varieties: This acts as a one-way permeable barrier. 
It guarantees that all memory operations after the ACQUIRE operation will appear to happen after the ACQUIRE operation with respect to the other components of the system. - ACQUIRE operations include LOCK operations and smp_load_acquire() - operations. + ACQUIRE operations include LOCK operations and both smp_load_acquire() + and smp_cond_acquire() operations. The later builds the necessary ACQUIRE + semantics from relying on a control dependency and smp_rmb(). Memory operations that occur before an ACQUIRE operation may appear to happen after it completes. @@ -464,6 +498,11 @@ And a couple of implicit varieties: This means that ACQUIRE acts as a minimal "acquire" operation and RELEASE acts as a minimal "release" operation. +A subset of the atomic operations described in atomic_ops.txt have ACQUIRE +and RELEASE variants in addition to fully-ordered and relaxed (no barrier +semantics) definitions. For compound atomics performing both a load and a +store, ACQUIRE semantics apply only to the load and RELEASE semantics apply +only to the store portion of the operation. Memory barriers are only required where there's a possibility of interaction between two CPUs or between a CPU and a device. If it can be guaranteed that @@ -517,7 +556,7 @@ following sequence of events: CPU 1 CPU 2 =============== =============== - { A == 1, B == 2, C = 3, P == &A, Q == &C } + { A == 1, B == 2, C == 3, P == &A, Q == &C } B = 4; <write barrier> WRITE_ONCE(P, &B) @@ -544,7 +583,7 @@ between the address load and the data load: CPU 1 CPU 2 =============== =============== - { A == 1, B == 2, C = 3, P == &A, Q == &C } + { A == 1, B == 2, C == 3, P == &A, Q == &C } B = 4; <write barrier> WRITE_ONCE(P, &B); @@ -813,9 +852,10 @@ In summary: the same variable, then those stores must be ordered, either by preceding both of them with smp_mb() or by using smp_store_release() to carry out the stores. Please note that it is -not- sufficient - to use barrier() at beginning of each leg of the "if" statement, - as optimizing compilers do not necessarily respect barrier() - in this case. + to use barrier() at beginning of each leg of the "if" statement + because, as shown by the example above, optimizing compilers can + destroy the control dependency while respecting the letter of the + barrier() law. (*) Control dependencies require at least one run-time conditional between the prior load and the subsequent store, and this @@ -1731,15 +1771,15 @@ The Linux kernel has eight basic CPU memory barriers: All memory barriers except the data dependency barriers imply a compiler -barrier. Data dependencies do not impose any additional compiler ordering. +barrier. Data dependencies do not impose any additional compiler ordering. Aside: In the case of data dependencies, the compiler would be expected to issue the loads in the correct order (eg. `a[b]` would have to load the value of b before loading a[b]), however there is no guarantee in the C specification that the compiler may not speculate the value of b (eg. is equal to 1) and load a before b (eg. tmp = a[1]; if (b != 1) -tmp = a[b]; ). There is also the problem of a compiler reloading b after -having loaded a[b], thus having a newer copy of b than a[b]. A consensus +tmp = a[b]; ). There is also the problem of a compiler reloading b after +having loaded a[b], thus having a newer copy of b than a[b]. A consensus has not yet been reached about these problems, however the READ_ONCE() macro is a good place to start looking. 
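As an illustrative sketch (not part of the patch; the variable and function
names are hypothetical), the ACQUIRE/RELEASE pairing discussed in the hunks
above is commonly used to publish data from one CPU to another:

	static int data;
	static int published;

	static void writer(void)	/* runs on CPU 1 */
	{
		data = 42;				/* plain store */
		smp_store_release(&published, 1);	/* RELEASE: orders the store to data before it */
	}

	static void reader(void)	/* runs on CPU 2 */
	{
		if (smp_load_acquire(&published))	/* ACQUIRE: orders later accesses after it */
			BUG_ON(data != 42);		/* guaranteed to observe the writer's store */
	}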
@@ -1794,6 +1834,7 @@ There are some more advanced barrier functions: (*) lockless_dereference(); + This can be thought of as a pointer-fetch wrapper around the smp_read_barrier_depends() data-dependency barrier. @@ -1858,7 +1899,7 @@ This is a variation on the mandatory write barrier that causes writes to weakly ordered I/O regions to be partially ordered. Its effects may go beyond the CPU->Hardware interface and actually affect the hardware at some level. -See the subsection "Locks vs I/O accesses" for more information. +See the subsection "Acquires vs I/O accesses" for more information. =============================== @@ -1873,8 +1914,8 @@ provide more substantial guarantees, but these may not be relied upon outside of arch specific code. -ACQUIRING FUNCTIONS -------------------- +LOCK ACQUISITION FUNCTIONS +-------------------------- The Linux kernel has a number of locking constructs: @@ -1895,7 +1936,7 @@ for each construct. These operations all imply certain barriers: Memory operations issued before the ACQUIRE may be completed after the ACQUIRE operation has completed. An smp_mb__before_spinlock(), combined with a following ACQUIRE, orders prior stores against - subsequent loads and stores. Note that this is weaker than smp_mb()! + subsequent loads and stores. Note that this is weaker than smp_mb()! The smp_mb__before_spinlock() primitive is free on many architectures. (2) RELEASE operation implication: @@ -2090,9 +2131,9 @@ or: event_indicated = 1; wake_up_process(event_daemon); -A write memory barrier is implied by wake_up() and co. if and only if they wake -something up. The barrier occurs before the task state is cleared, and so sits -between the STORE to indicate the event and the STORE to set TASK_RUNNING: +A write memory barrier is implied by wake_up() and co. if and only if they +wake something up. The barrier occurs before the task state is cleared, and so +sits between the STORE to indicate the event and the STORE to set TASK_RUNNING: CPU 1 CPU 2 =============================== =============================== @@ -2206,7 +2247,7 @@ three CPUs; then should the following sequence of events occur: Then there is no guarantee as to what order CPU 3 will see the accesses to *A through *H occur in, other than the constraints imposed by the separate locks -on the separate CPUs. It might, for example, see: +on the separate CPUs. It might, for example, see: *E, ACQUIRE M, ACQUIRE Q, *G, *C, *F, *A, *B, RELEASE Q, *D, *H, RELEASE M @@ -2486,9 +2527,9 @@ The following operations are special locking primitives: clear_bit_unlock(); __clear_bit_unlock(); -These implement ACQUIRE-class and RELEASE-class operations. These should be used in -preference to other operations when implementing locking primitives, because -their implementations can be optimised on many architectures. +These implement ACQUIRE-class and RELEASE-class operations. These should be +used in preference to other operations when implementing locking primitives, +because their implementations can be optimised on many architectures. [!] Note that special memory barrier primitives are available for these situations because on some CPUs the atomic instructions used imply full memory @@ -2568,12 +2609,12 @@ explicit barriers are used. Normally this won't be a problem because the I/O accesses done inside such sections will include synchronous load operations on strictly ordered I/O -registers that form implicit I/O barriers. If this isn't sufficient then an +registers that form implicit I/O barriers. 
If this isn't sufficient then an mmiowb() may need to be used explicitly. A similar situation may occur between an interrupt routine and two routines -running on separate CPUs that communicate with each other. If such a case is +running on separate CPUs that communicate with each other. If such a case is likely, then interrupt-disabling locks should be used to guarantee ordering. @@ -2587,8 +2628,8 @@ functions: (*) inX(), outX(): These are intended to talk to I/O space rather than memory space, but - that's primarily a CPU-specific concept. The i386 and x86_64 processors do - indeed have special I/O space access cycles and instructions, but many + that's primarily a CPU-specific concept. The i386 and x86_64 processors + do indeed have special I/O space access cycles and instructions, but many CPUs don't have such a concept. The PCI bus, amongst others, defines an I/O space concept which - on such @@ -2610,7 +2651,7 @@ functions: Whether these are guaranteed to be fully ordered and uncombined with respect to each other on the issuing CPU depends on the characteristics - defined for the memory window through which they're accessing. On later + defined for the memory window through which they're accessing. On later i386 architecture machines, for example, this is controlled by way of the MTRR registers. @@ -2635,10 +2676,10 @@ functions: (*) readX_relaxed(), writeX_relaxed() These are similar to readX() and writeX(), but provide weaker memory - ordering guarantees. Specifically, they do not guarantee ordering with + ordering guarantees. Specifically, they do not guarantee ordering with respect to normal memory accesses (e.g. DMA buffers) nor do they guarantee - ordering with respect to LOCK or UNLOCK operations. If the latter is - required, an mmiowb() barrier can be used. Note that relaxed accesses to + ordering with respect to LOCK or UNLOCK operations. If the latter is + required, an mmiowb() barrier can be used. Note that relaxed accesses to the same peripheral are guaranteed to be ordered with respect to each other. @@ -3040,8 +3081,9 @@ The Alpha defines the Linux kernel's memory barrier model. See the subsection on "Cache Coherency" above. + VIRTUAL MACHINE GUESTS -------------------- +---------------------- Guests running within virtual machines might be affected by SMP effects even if the guest itself is compiled without SMP support. This is an artifact of @@ -3050,7 +3092,7 @@ barriers for this use-case would be possible but is often suboptimal. To handle this case optimally, low-level virt_mb() etc macros are available. These have the same effect as smp_mb() etc when SMP is enabled, but generate -identical code for SMP and non-SMP systems. For example, virtual machine guests +identical code for SMP and non-SMP systems. For example, virtual machine guests should use virt_mb() rather than smp_mb() when synchronizing against a (possibly SMP) host. @@ -3058,6 +3100,7 @@ These are equivalent to smp_mb() etc counterparts in all other respects, in particular, they do not control MMIO effects: to control MMIO effects, use mandatory barriers. 
+ ============ EXAMPLE USES ============ diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index fcddfd5ded99..daabdd7ee543 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -60,6 +60,7 @@ show up in /proc/sys/kernel: - panic_on_warn - perf_cpu_time_max_percent - perf_event_paranoid +- perf_event_max_stack - pid_max - powersave-nap [ PPC only ] - printk @@ -654,6 +655,19 @@ users (without CAP_SYS_ADMIN). The default value is 2. ============================================================== +perf_event_max_stack: + +Controls maximum number of stack frames to copy for (attr.sample_type & +PERF_SAMPLE_CALLCHAIN) configured events, for instance, when using +'perf record -g' or 'perf trace --call-graph fp'. + +This can only be done when no events are in use that have callchains +enabled, otherwise writing to this file will return -EBUSY. + +The default value is 127. + +============================================================== + pid_max: PID allocation wrap value. When the kernel's next PID value diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index f52f297cb406..9857606dd7b7 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -1562,12 +1562,12 @@ Doing the same with chrt -r 5 and function-trace set. <idle>-0 3dN.1 12us : menu_hrtimer_cancel <-tick_nohz_idle_exit <idle>-0 3dN.1 12us : ktime_get <-tick_nohz_idle_exit <idle>-0 3dN.1 12us : tick_do_update_jiffies64 <-tick_nohz_idle_exit - <idle>-0 3dN.1 13us : update_cpu_load_nohz <-tick_nohz_idle_exit - <idle>-0 3dN.1 13us : _raw_spin_lock <-update_cpu_load_nohz + <idle>-0 3dN.1 13us : cpu_load_update_nohz <-tick_nohz_idle_exit + <idle>-0 3dN.1 13us : _raw_spin_lock <-cpu_load_update_nohz <idle>-0 3dN.1 13us : add_preempt_count <-_raw_spin_lock - <idle>-0 3dN.2 13us : __update_cpu_load <-update_cpu_load_nohz - <idle>-0 3dN.2 14us : sched_avg_update <-__update_cpu_load - <idle>-0 3dN.2 14us : _raw_spin_unlock <-update_cpu_load_nohz + <idle>-0 3dN.2 13us : __cpu_load_update <-cpu_load_update_nohz + <idle>-0 3dN.2 14us : sched_avg_update <-__cpu_load_update + <idle>-0 3dN.2 14us : _raw_spin_unlock <-cpu_load_update_nohz <idle>-0 3dN.2 14us : sub_preempt_count <-_raw_spin_unlock <idle>-0 3dN.1 15us : calc_load_exit_idle <-tick_nohz_idle_exit <idle>-0 3dN.1 15us : touch_softlockup_watchdog <-tick_nohz_idle_exit diff --git a/Documentation/x86/pat.txt b/Documentation/x86/pat.txt index 54944c71b819..2a4ee6302122 100644 --- a/Documentation/x86/pat.txt +++ b/Documentation/x86/pat.txt @@ -196,3 +196,35 @@ Another, more verbose way of getting PAT related debug messages is with "debugpat" boot parameter. With this parameter, various debug messages are printed to dmesg log. +PAT Initialization +------------------ + +The following table describes how PAT is initialized under various +configurations. The PAT MSR must be updated by Linux in order to support WC +and WT attributes. Otherwise, the PAT MSR has the value programmed in it +by the firmware. Note, Xen enables WC attribute in the PAT MSR for guests. 
+ + MTRR PAT Call Sequence PAT State PAT MSR + ========================================================= + E E MTRR -> PAT init Enabled OS + E D MTRR -> PAT init Disabled - + D E MTRR -> PAT disable Disabled BIOS + D D MTRR -> PAT disable Disabled - + - np/E PAT -> PAT disable Disabled BIOS + - np/D PAT -> PAT disable Disabled - + E !P/E MTRR -> PAT init Disabled BIOS + D !P/E MTRR -> PAT disable Disabled BIOS + !M !P/E MTRR stub -> PAT disable Disabled BIOS + + Legend + ------------------------------------------------ + E Feature enabled in CPU + D Feature disabled/unsupported in CPU + np "nopat" boot option specified + !P CONFIG_X86_PAT option unset + !M CONFIG_MTRR option unset + Enabled PAT state set to enabled + Disabled PAT state set to disabled + OS PAT initializes PAT MSR with OS setting + BIOS PAT keeps PAT MSR with BIOS setting + @@ -1,7 +1,7 @@ VERSION = 4 PATCHLEVEL = 6 SUBLEVEL = 0 -EXTRAVERSION = -rc7 +EXTRAVERSION = NAME = Charred Weasel # *DOCUMENTATION* diff --git a/arch/alpha/include/asm/rwsem.h b/arch/alpha/include/asm/rwsem.h index a83bbea62c67..0131a7058778 100644 --- a/arch/alpha/include/asm/rwsem.h +++ b/arch/alpha/include/asm/rwsem.h @@ -63,7 +63,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) return res >= 0 ? 1 : 0; } -static inline void __down_write(struct rw_semaphore *sem) +static inline long ___down_write(struct rw_semaphore *sem) { long oldcount; #ifndef CONFIG_SMP @@ -83,10 +83,24 @@ static inline void __down_write(struct rw_semaphore *sem) :"=&r" (oldcount), "=m" (sem->count), "=&r" (temp) :"Ir" (RWSEM_ACTIVE_WRITE_BIAS), "m" (sem->count) : "memory"); #endif - if (unlikely(oldcount)) + return oldcount; +} + +static inline void __down_write(struct rw_semaphore *sem) +{ + if (unlikely(___down_write(sem))) rwsem_down_write_failed(sem); } +static inline int __down_write_killable(struct rw_semaphore *sem) +{ + if (unlikely(___down_write(sem))) + if (IS_ERR(rwsem_down_write_failed_killable(sem))) + return -EINTR; + + return 0; +} + /* * trylock for writing -- returns 1 if successful, 0 if contention */ diff --git a/arch/arm/boot/dts/at91sam9x5.dtsi b/arch/arm/boot/dts/at91sam9x5.dtsi index 0827d594b1f0..cd0cd5fd09a3 100644 --- a/arch/arm/boot/dts/at91sam9x5.dtsi +++ b/arch/arm/boot/dts/at91sam9x5.dtsi @@ -106,7 +106,7 @@ pmc: pmc@fffffc00 { compatible = "atmel,at91sam9x5-pmc", "syscon"; - reg = <0xfffffc00 0x100>; + reg = <0xfffffc00 0x200>; interrupts = <1 IRQ_TYPE_LEVEL_HIGH 7>; interrupt-controller; #address-cells = <1>; diff --git a/arch/arm/boot/dts/sama5d2.dtsi b/arch/arm/boot/dts/sama5d2.dtsi index 78996bdbd3df..9817090c1b73 100644 --- a/arch/arm/boot/dts/sama5d2.dtsi +++ b/arch/arm/boot/dts/sama5d2.dtsi @@ -280,7 +280,7 @@ status = "disabled"; nfc@c0000000 { - compatible = "atmel,sama5d4-nfc"; + compatible = "atmel,sama5d3-nfc"; #address-cells = <1>; #size-cells = <1>; reg = < /* NFC Command Registers */ diff --git a/arch/arm/include/asm/efi.h b/arch/arm/include/asm/efi.h index e0eea72deb87..a708fa1f0905 100644 --- a/arch/arm/include/asm/efi.h +++ b/arch/arm/include/asm/efi.h @@ -17,34 +17,28 @@ #include <asm/mach/map.h> #include <asm/mmu_context.h> #include <asm/pgtable.h> +#include <asm/ptrace.h> #ifdef CONFIG_EFI void efi_init(void); int efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md); +int efi_set_mapping_permissions(struct mm_struct *mm, efi_memory_desc_t *md); -#define efi_call_virt(f, ...) 
\ -({ \ - efi_##f##_t *__f; \ - efi_status_t __s; \ - \ - efi_virtmap_load(); \ - __f = efi.systab->runtime->f; \ - __s = __f(__VA_ARGS__); \ - efi_virtmap_unload(); \ - __s; \ -}) +#define arch_efi_call_virt_setup() efi_virtmap_load() +#define arch_efi_call_virt_teardown() efi_virtmap_unload() -#define __efi_call_virt(f, ...) \ +#define arch_efi_call_virt(f, args...) \ ({ \ efi_##f##_t *__f; \ - \ - efi_virtmap_load(); \ __f = efi.systab->runtime->f; \ - __f(__VA_ARGS__); \ - efi_virtmap_unload(); \ + __f(args); \ }) +#define ARCH_EFI_IRQ_FLAGS_MASK \ + (PSR_J_BIT | PSR_E_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT | \ + PSR_T_BIT | MODE_MASK) + static inline void efi_set_pgd(struct mm_struct *mm) { check_and_switch_context(mm, NULL); @@ -59,7 +53,16 @@ void efi_virtmap_unload(void); /* arch specific definitions used by the stub code */ -#define efi_call_early(f, ...) sys_table_arg->boottime->f(__VA_ARGS__) +#define efi_call_early(f, ...) sys_table_arg->boottime->f(__VA_ARGS__) +#define __efi_call_early(f, ...) f(__VA_ARGS__) +#define efi_is_64bit() (false) + +struct screen_info *alloc_screen_info(efi_system_table_t *sys_table_arg); +void free_screen_info(efi_system_table_t *sys_table, struct screen_info *si); + +static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt) +{ +} /* * A reasonable upper bound for the uncompressed kernel size is 32 MBytes, diff --git a/arch/arm/include/asm/mmu_context.h b/arch/arm/include/asm/mmu_context.h index fa5b42d44985..3cc14dd8587c 100644 --- a/arch/arm/include/asm/mmu_context.h +++ b/arch/arm/include/asm/mmu_context.h @@ -15,6 +15,7 @@ #include <linux/compiler.h> #include <linux/sched.h> +#include <linux/preempt.h> #include <asm/cacheflush.h> #include <asm/cachetype.h> #include <asm/proc-fns.h> @@ -66,6 +67,7 @@ static inline void check_and_switch_context(struct mm_struct *mm, cpu_switch_mm(mm->pgd, mm); } +#ifndef MODULE #define finish_arch_post_lock_switch \ finish_arch_post_lock_switch static inline void finish_arch_post_lock_switch(void) @@ -87,6 +89,7 @@ static inline void finish_arch_post_lock_switch(void) preempt_enable_no_resched(); } } +#endif /* !MODULE */ #endif /* CONFIG_MMU */ diff --git a/arch/arm/kernel/efi.c b/arch/arm/kernel/efi.c index ff8a9d8acfac..9f43ba012d10 100644 --- a/arch/arm/kernel/efi.c +++ b/arch/arm/kernel/efi.c @@ -11,6 +11,41 @@ #include <asm/mach/map.h> #include <asm/mmu_context.h> +static int __init set_permissions(pte_t *ptep, pgtable_t token, + unsigned long addr, void *data) +{ + efi_memory_desc_t *md = data; + pte_t pte = *ptep; + + if (md->attribute & EFI_MEMORY_RO) + pte = set_pte_bit(pte, __pgprot(L_PTE_RDONLY)); + if (md->attribute & EFI_MEMORY_XP) + pte = set_pte_bit(pte, __pgprot(L_PTE_XN)); + set_pte_ext(ptep, pte, PTE_EXT_NG); + return 0; +} + +int __init efi_set_mapping_permissions(struct mm_struct *mm, + efi_memory_desc_t *md) +{ + unsigned long base, size; + + base = md->virt_addr; + size = md->num_pages << EFI_PAGE_SHIFT; + + /* + * We can only use apply_to_page_range() if we can guarantee that the + * entire region was mapped using pages. This should be the case if the + * region does not cover any naturally aligned SECTION_SIZE sized + * blocks. 
+ */ + if (round_down(base + size, SECTION_SIZE) < + round_up(base, SECTION_SIZE) + SECTION_SIZE) + return apply_to_page_range(mm, base, size, set_permissions, md); + + return 0; +} + int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) { struct map_desc desc = { @@ -34,5 +69,11 @@ int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) desc.type = MT_DEVICE; create_mapping_late(mm, &desc, true); + + /* + * If stricter permissions were specified, apply them now. + */ + if (md->attribute & (EFI_MEMORY_RO | EFI_MEMORY_XP)) + return efi_set_mapping_permissions(mm, md); return 0; } diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c index 6284779d64ee..b8df45883cf7 100644 --- a/arch/arm/kernel/hw_breakpoint.c +++ b/arch/arm/kernel/hw_breakpoint.c @@ -631,7 +631,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) info->address &= ~alignment_mask; info->ctrl.len <<= offset; - if (!bp->overflow_handler) { + if (is_default_overflow_handler(bp)) { /* * Mismatch breakpoints are required for single-stepping * breakpoints. @@ -754,7 +754,7 @@ static void watchpoint_handler(unsigned long addr, unsigned int fsr, * mismatch breakpoint so we can single-step over the * watchpoint trigger. */ - if (!wp->overflow_handler) + if (is_default_overflow_handler(wp)) enable_single_step(wp, instruction_pointer(regs)); unlock: diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c index 4e02ae5950ff..27563befa8a2 100644 --- a/arch/arm/kernel/perf_callchain.c +++ b/arch/arm/kernel/perf_callchain.c @@ -75,7 +75,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) tail = (struct frame_tail __user *)regs->ARM_fp - 1; - while ((entry->nr < PERF_MAX_STACK_DEPTH) && + while ((entry->nr < sysctl_perf_event_max_stack) && tail && !((unsigned long)tail & 0x3)) tail = user_backtrace(tail, entry); } diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index 2c4bea39cf22..7d4e2850910c 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -883,7 +883,8 @@ static void __init request_standard_resources(const struct machine_desc *mdesc) request_resource(&ioport_resource, &lp2); } -#if defined(CONFIG_VGA_CONSOLE) || defined(CONFIG_DUMMY_CONSOLE) +#if defined(CONFIG_VGA_CONSOLE) || defined(CONFIG_DUMMY_CONSOLE) || \ + defined(CONFIG_EFI) struct screen_info screen_info = { .orig_video_lines = 30, .orig_video_cols = 80, diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms index efa77c146415..521b1ec59157 100644 --- a/arch/arm64/Kconfig.platforms +++ b/arch/arm64/Kconfig.platforms @@ -2,6 +2,7 @@ menu "Platform selection" config ARCH_SUNXI bool "Allwinner sunxi 64-bit SoC Family" + select GENERIC_IRQ_CHIP help This enables support for Allwinner sunxi based SoCs like the A64. diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h index 8e88a696c9cb..622db3c6474e 100644 --- a/arch/arm64/include/asm/efi.h +++ b/arch/arm64/include/asm/efi.h @@ -4,6 +4,7 @@ #include <asm/io.h> #include <asm/mmu_context.h> #include <asm/neon.h> +#include <asm/ptrace.h> #include <asm/tlbflush.h> #ifdef CONFIG_EFI @@ -14,32 +15,29 @@ extern void efi_init(void); int efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md); -#define efi_call_virt(f, ...) 
\ +#define efi_set_mapping_permissions efi_create_mapping + +#define arch_efi_call_virt_setup() \ ({ \ - efi_##f##_t *__f; \ - efi_status_t __s; \ - \ kernel_neon_begin(); \ efi_virtmap_load(); \ - __f = efi.systab->runtime->f; \ - __s = __f(__VA_ARGS__); \ - efi_virtmap_unload(); \ - kernel_neon_end(); \ - __s; \ }) -#define __efi_call_virt(f, ...) \ +#define arch_efi_call_virt(f, args...) \ ({ \ efi_##f##_t *__f; \ - \ - kernel_neon_begin(); \ - efi_virtmap_load(); \ __f = efi.systab->runtime->f; \ - __f(__VA_ARGS__); \ + __f(args); \ +}) + +#define arch_efi_call_virt_teardown() \ +({ \ efi_virtmap_unload(); \ kernel_neon_end(); \ }) +#define ARCH_EFI_IRQ_FLAGS_MASK (PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT) + /* arch specific definitions used by the stub code */ /* @@ -50,7 +48,16 @@ int efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md); #define EFI_FDT_ALIGN SZ_2M /* used by allocate_new_fdt_and_exit_boot() */ #define MAX_FDT_OFFSET SZ_512M -#define efi_call_early(f, ...) sys_table_arg->boottime->f(__VA_ARGS__) +#define efi_call_early(f, ...) sys_table_arg->boottime->f(__VA_ARGS__) +#define __efi_call_early(f, ...) f(__VA_ARGS__) +#define efi_is_64bit() (true) + +#define alloc_screen_info(x...) &screen_info +#define free_screen_info(x...) + +static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt) +{ +} #define EFI_ALLOC_ALIGN SZ_64K diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index b6abc852f2a1..78f52488f9ff 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -17,22 +17,51 @@ #include <asm/efi.h> -int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) +/* + * Only regions of type EFI_RUNTIME_SERVICES_CODE need to be + * executable, everything else can be mapped with the XN bits + * set. Also take the new (optional) RO/XP bits into account. + */ +static __init pteval_t create_mapping_protection(efi_memory_desc_t *md) { - pteval_t prot_val; + u64 attr = md->attribute; + u32 type = md->type; - /* - * Only regions of type EFI_RUNTIME_SERVICES_CODE need to be - * executable, everything else can be mapped with the XN bits - * set. - */ - if ((md->attribute & EFI_MEMORY_WB) == 0) - prot_val = PROT_DEVICE_nGnRE; - else if (md->type == EFI_RUNTIME_SERVICES_CODE || - !PAGE_ALIGNED(md->phys_addr)) - prot_val = pgprot_val(PAGE_KERNEL_EXEC); - else - prot_val = pgprot_val(PAGE_KERNEL); + if (type == EFI_MEMORY_MAPPED_IO) + return PROT_DEVICE_nGnRE; + + if (WARN_ONCE(!PAGE_ALIGNED(md->phys_addr), + "UEFI Runtime regions are not aligned to 64 KB -- buggy firmware?")) + /* + * If the region is not aligned to the page size of the OS, we + * can not use strict permissions, since that would also affect + * the mapping attributes of the adjacent regions. 
+ */ + return pgprot_val(PAGE_KERNEL_EXEC); + + /* R-- */ + if ((attr & (EFI_MEMORY_XP | EFI_MEMORY_RO)) == + (EFI_MEMORY_XP | EFI_MEMORY_RO)) + return pgprot_val(PAGE_KERNEL_RO); + + /* R-X */ + if (attr & EFI_MEMORY_RO) + return pgprot_val(PAGE_KERNEL_ROX); + + /* RW- */ + if (attr & EFI_MEMORY_XP || type != EFI_RUNTIME_SERVICES_CODE) + return pgprot_val(PAGE_KERNEL); + + /* RWX */ + return pgprot_val(PAGE_KERNEL_EXEC); +} + +/* we will fill this structure from the stub, so don't put it in .bss */ +struct screen_info screen_info __section(.data); + +int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) +{ + pteval_t prot_val = create_mapping_protection(md); create_pgd_mapping(mm, md->phys_addr, md->virt_addr, md->num_pages << EFI_PAGE_SHIFT, diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index b45c95d34b83..4ef5373f9a76 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -616,7 +616,7 @@ static int breakpoint_handler(unsigned long unused, unsigned int esr, perf_bp_event(bp, regs); /* Do we need to handle the stepping? */ - if (!bp->overflow_handler) + if (is_default_overflow_handler(bp)) step = 1; unlock: rcu_read_unlock(); @@ -712,7 +712,7 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr, perf_bp_event(wp, regs); /* Do we need to handle the stepping? */ - if (!wp->overflow_handler) + if (is_default_overflow_handler(wp)) step = 1; unlock: diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h index 5e360ce88f10..1428849aece8 100644 --- a/arch/arm64/kernel/image.h +++ b/arch/arm64/kernel/image.h @@ -112,6 +112,7 @@ __efistub___memset = KALLSYMS_HIDE(__pi_memset); __efistub__text = KALLSYMS_HIDE(_text); __efistub__end = KALLSYMS_HIDE(_end); __efistub__edata = KALLSYMS_HIDE(_edata); +__efistub_screen_info = KALLSYMS_HIDE(screen_info); #endif diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c index ff4665462a02..32c3c6e70119 100644 --- a/arch/arm64/kernel/perf_callchain.c +++ b/arch/arm64/kernel/perf_callchain.c @@ -122,7 +122,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry, tail = (struct frame_tail __user *)regs->regs[29]; - while (entry->nr < PERF_MAX_STACK_DEPTH && + while (entry->nr < sysctl_perf_event_max_stack && tail && !((unsigned long)tail & 0xf)) tail = user_backtrace(tail, entry); } else { @@ -132,7 +132,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry, tail = (struct compat_frame_tail __user *)regs->compat_fp - 1; - while ((entry->nr < PERF_MAX_STACK_DEPTH) && + while ((entry->nr < sysctl_perf_event_max_stack) && tail && !((unsigned long)tail & 0x3)) tail = compat_user_backtrace(tail, entry); #endif diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index a34420a5df9a..b405bbb54431 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -476,6 +476,7 @@ emit_cond_jmp: case BPF_JGE: jmp_cond = A64_COND_CS; break; + case BPF_JSET: case BPF_JNE: jmp_cond = A64_COND_NE; break; diff --git a/arch/ia64/include/asm/iommu.h b/arch/ia64/include/asm/iommu.h index 105c93b00b1b..1d1212901ae7 100644 --- a/arch/ia64/include/asm/iommu.h +++ b/arch/ia64/include/asm/iommu.h @@ -1,7 +1,6 @@ #ifndef _ASM_IA64_IOMMU_H #define _ASM_IA64_IOMMU_H 1 -#define cpu_has_x2apic 0 /* 10 seconds */ #define DMAR_OPERATION_TIMEOUT (((cycles_t) local_cpu_data->itc_freq)*10) diff --git a/arch/ia64/include/asm/rwsem.h b/arch/ia64/include/asm/rwsem.h index 
ce112472bdd6..8b23e070b844 100644 --- a/arch/ia64/include/asm/rwsem.h +++ b/arch/ia64/include/asm/rwsem.h @@ -49,8 +49,8 @@ __down_read (struct rw_semaphore *sem) /* * lock for writing */ -static inline void -__down_write (struct rw_semaphore *sem) +static inline long +___down_write (struct rw_semaphore *sem) { long old, new; @@ -59,10 +59,26 @@ __down_write (struct rw_semaphore *sem) new = old + RWSEM_ACTIVE_WRITE_BIAS; } while (cmpxchg_acq(&sem->count, old, new) != old); - if (old != 0) + return old; +} + +static inline void +__down_write (struct rw_semaphore *sem) +{ + if (___down_write(sem)) rwsem_down_write_failed(sem); } +static inline int +__down_write_killable (struct rw_semaphore *sem) +{ + if (___down_write(sem)) + if (IS_ERR(rwsem_down_write_failed_killable(sem))) + return -EINTR; + + return 0; +} + /* * unlock after reading */ diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index 300dac3702f1..bf0865cd438a 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -531,8 +531,6 @@ efi_init (void) efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff, vendor); - set_bit(EFI_SYSTEM_TABLES, &efi.flags); - palo_phys = EFI_INVALID_TABLE_ADDR; if (efi_config_init(arch_tables) != 0) diff --git a/arch/metag/kernel/perf_callchain.c b/arch/metag/kernel/perf_callchain.c index 315633461a94..252abc12a5a3 100644 --- a/arch/metag/kernel/perf_callchain.c +++ b/arch/metag/kernel/perf_callchain.c @@ -65,7 +65,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) --frame; - while ((entry->nr < PERF_MAX_STACK_DEPTH) && frame) + while ((entry->nr < sysctl_perf_event_max_stack) && frame) frame = user_backtrace(frame, entry); } diff --git a/arch/mips/kernel/perf_event.c b/arch/mips/kernel/perf_event.c index c1cf9c6c3f77..5021c546ad07 100644 --- a/arch/mips/kernel/perf_event.c +++ b/arch/mips/kernel/perf_event.c @@ -35,7 +35,7 @@ static void save_raw_perf_callchain(struct perf_callchain_entry *entry, addr = *sp++; if (__kernel_text_address(addr)) { perf_callchain_store(entry, addr); - if (entry->nr >= PERF_MAX_STACK_DEPTH) + if (entry->nr >= sysctl_perf_event_max_stack) break; } } @@ -59,7 +59,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry, } do { perf_callchain_store(entry, pc); - if (entry->nr >= PERF_MAX_STACK_DEPTH) + if (entry->nr >= sysctl_perf_event_max_stack) break; pc = unwind_stack(current, &sp, pc, &ra); } while (pc); diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 8cac1eb41466..55c924b65f71 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -565,7 +565,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle) smp_ops->give_timebase(); /* Wait until cpu puts itself in the online & active maps */ - while (!cpu_online(cpu) || !cpu_active(cpu)) + while (!cpu_online(cpu)) cpu_relax(); return 0; diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c index e04a6752b399..22d9015c1acc 100644 --- a/arch/powerpc/perf/callchain.c +++ b/arch/powerpc/perf/callchain.c @@ -247,7 +247,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry, sp = regs->gpr[1]; perf_callchain_store(entry, next_ip); - while (entry->nr < PERF_MAX_STACK_DEPTH) { + while (entry->nr < sysctl_perf_event_max_stack) { fp = (unsigned long __user *) sp; if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp)) return; @@ -453,7 +453,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry, sp = regs->gpr[1]; perf_callchain_store(entry, 
next_ip); - while (entry->nr < PERF_MAX_STACK_DEPTH) { + while (entry->nr < sysctl_perf_event_max_stack) { fp = (unsigned int __user *) (unsigned long) sp; if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp)) return; diff --git a/arch/s390/include/asm/rwsem.h b/arch/s390/include/asm/rwsem.h index fead491dfc28..c75e4471e618 100644 --- a/arch/s390/include/asm/rwsem.h +++ b/arch/s390/include/asm/rwsem.h @@ -90,7 +90,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) /* * lock for writing */ -static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) +static inline long ___down_write(struct rw_semaphore *sem) { signed long old, new, tmp; @@ -104,13 +104,23 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) : "=&d" (old), "=&d" (new), "=Q" (sem->count) : "Q" (sem->count), "m" (tmp) : "cc", "memory"); - if (old != 0) - rwsem_down_write_failed(sem); + + return old; } static inline void __down_write(struct rw_semaphore *sem) { - __down_write_nested(sem, 0); + if (___down_write(sem)) + rwsem_down_write_failed(sem); +} + +static inline int __down_write_killable(struct rw_semaphore *sem) +{ + if (___down_write(sem)) + if (IS_ERR(rwsem_down_write_failed_killable(sem))) + return -EINTR; + + return 0; } /* diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 40a6b4f9c36c..7b89a7572100 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -832,7 +832,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle) pcpu_attach_task(pcpu, tidle); pcpu_start_fn(pcpu, smp_start_secondary, NULL); /* Wait until cpu puts itself in the online & active maps */ - while (!cpu_online(cpu) || !cpu_active(cpu)) + while (!cpu_online(cpu)) cpu_relax(); return 0; } diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild index a319745a7b63..751c3373a92c 100644 --- a/arch/sh/include/asm/Kbuild +++ b/arch/sh/include/asm/Kbuild @@ -26,6 +26,7 @@ generic-y += percpu.h generic-y += poll.h generic-y += preempt.h generic-y += resource.h +generic-y += rwsem.h generic-y += sembuf.h generic-y += serial.h generic-y += shmbuf.h diff --git a/arch/sh/include/asm/rwsem.h b/arch/sh/include/asm/rwsem.h deleted file mode 100644 index edab57265293..000000000000 --- a/arch/sh/include/asm/rwsem.h +++ /dev/null @@ -1,132 +0,0 @@ -/* - * include/asm-sh/rwsem.h: R/W semaphores for SH using the stuff - * in lib/rwsem.c. 
- */ - -#ifndef _ASM_SH_RWSEM_H -#define _ASM_SH_RWSEM_H - -#ifndef _LINUX_RWSEM_H -#error "please don't include asm/rwsem.h directly, use linux/rwsem.h instead" -#endif - -#ifdef __KERNEL__ - -#define RWSEM_UNLOCKED_VALUE 0x00000000 -#define RWSEM_ACTIVE_BIAS 0x00000001 -#define RWSEM_ACTIVE_MASK 0x0000ffff -#define RWSEM_WAITING_BIAS (-0x00010000) -#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS -#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - -/* - * lock for reading - */ -static inline void __down_read(struct rw_semaphore *sem) -{ - if (atomic_inc_return((atomic_t *)(&sem->count)) > 0) - smp_wmb(); - else - rwsem_down_read_failed(sem); -} - -static inline int __down_read_trylock(struct rw_semaphore *sem) -{ - int tmp; - - while ((tmp = sem->count) >= 0) { - if (tmp == cmpxchg(&sem->count, tmp, - tmp + RWSEM_ACTIVE_READ_BIAS)) { - smp_wmb(); - return 1; - } - } - return 0; -} - -/* - * lock for writing - */ -static inline void __down_write(struct rw_semaphore *sem) -{ - int tmp; - - tmp = atomic_add_return(RWSEM_ACTIVE_WRITE_BIAS, - (atomic_t *)(&sem->count)); - if (tmp == RWSEM_ACTIVE_WRITE_BIAS) - smp_wmb(); - else - rwsem_down_write_failed(sem); -} - -static inline int __down_write_trylock(struct rw_semaphore *sem) -{ - int tmp; - - tmp = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE, - RWSEM_ACTIVE_WRITE_BIAS); - smp_wmb(); - return tmp == RWSEM_UNLOCKED_VALUE; -} - -/* - * unlock after reading - */ -static inline void __up_read(struct rw_semaphore *sem) -{ - int tmp; - - smp_wmb(); - tmp = atomic_dec_return((atomic_t *)(&sem->count)); - if (tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0) - rwsem_wake(sem); -} - -/* - * unlock after writing - */ -static inline void __up_write(struct rw_semaphore *sem) -{ - smp_wmb(); - if (atomic_sub_return(RWSEM_ACTIVE_WRITE_BIAS, - (atomic_t *)(&sem->count)) < 0) - rwsem_wake(sem); -} - -/* - * implement atomic add functionality - */ -static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) -{ - atomic_add(delta, (atomic_t *)(&sem->count)); -} - -/* - * downgrade write lock to read lock - */ -static inline void __downgrade_write(struct rw_semaphore *sem) -{ - int tmp; - - smp_wmb(); - tmp = atomic_add_return(-RWSEM_WAITING_BIAS, (atomic_t *)(&sem->count)); - if (tmp < 0) - rwsem_downgrade_wake(sem); -} - -static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) -{ - __down_write(sem); -} - -/* - * implement exchange and add functionality - */ -static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) -{ - smp_mb(); - return atomic_add_return(delta, (atomic_t *)(&sem->count)); -} - -#endif /* __KERNEL__ */ -#endif /* _ASM_SH_RWSEM_H */ diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild index e928618838bc..6024c26c0585 100644 --- a/arch/sparc/include/asm/Kbuild +++ b/arch/sparc/include/asm/Kbuild @@ -16,6 +16,7 @@ generic-y += mm-arch-hooks.h generic-y += module.h generic-y += mutex.h generic-y += preempt.h +generic-y += rwsem.h generic-y += serial.h generic-y += trace_clock.h generic-y += types.h diff --git a/arch/sparc/include/asm/rwsem.h b/arch/sparc/include/asm/rwsem.h deleted file mode 100644 index 069bf4d663a1..000000000000 --- a/arch/sparc/include/asm/rwsem.h +++ /dev/null @@ -1,124 +0,0 @@ -/* - * rwsem.h: R/W semaphores implemented using CAS - * - * Written by David S. Miller (davem@redhat.com), 2001. 
- * Derived from asm-i386/rwsem.h - */ -#ifndef _SPARC64_RWSEM_H -#define _SPARC64_RWSEM_H - -#ifndef _LINUX_RWSEM_H -#error "please don't include asm/rwsem.h directly, use linux/rwsem.h instead" -#endif - -#ifdef __KERNEL__ - -#define RWSEM_UNLOCKED_VALUE 0x00000000L -#define RWSEM_ACTIVE_BIAS 0x00000001L -#define RWSEM_ACTIVE_MASK 0xffffffffL -#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1) -#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS -#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - -/* - * lock for reading - */ -static inline void __down_read(struct rw_semaphore *sem) -{ - if (unlikely(atomic64_inc_return((atomic64_t *)(&sem->count)) <= 0L)) - rwsem_down_read_failed(sem); -} - -static inline int __down_read_trylock(struct rw_semaphore *sem) -{ - long tmp; - - while ((tmp = sem->count) >= 0L) { - if (tmp == cmpxchg(&sem->count, tmp, - tmp + RWSEM_ACTIVE_READ_BIAS)) { - return 1; - } - } - return 0; -} - -/* - * lock for writing - */ -static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) -{ - long tmp; - - tmp = atomic64_add_return(RWSEM_ACTIVE_WRITE_BIAS, - (atomic64_t *)(&sem->count)); - if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) - rwsem_down_write_failed(sem); -} - -static inline void __down_write(struct rw_semaphore *sem) -{ - __down_write_nested(sem, 0); -} - -static inline int __down_write_trylock(struct rw_semaphore *sem) -{ - long tmp; - - tmp = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE, - RWSEM_ACTIVE_WRITE_BIAS); - return tmp == RWSEM_UNLOCKED_VALUE; -} - -/* - * unlock after reading - */ -static inline void __up_read(struct rw_semaphore *sem) -{ - long tmp; - - tmp = atomic64_dec_return((atomic64_t *)(&sem->count)); - if (unlikely(tmp < -1L && (tmp & RWSEM_ACTIVE_MASK) == 0L)) - rwsem_wake(sem); -} - -/* - * unlock after writing - */ -static inline void __up_write(struct rw_semaphore *sem) -{ - if (unlikely(atomic64_sub_return(RWSEM_ACTIVE_WRITE_BIAS, - (atomic64_t *)(&sem->count)) < 0L)) - rwsem_wake(sem); -} - -/* - * implement atomic add functionality - */ -static inline void rwsem_atomic_add(long delta, struct rw_semaphore *sem) -{ - atomic64_add(delta, (atomic64_t *)(&sem->count)); -} - -/* - * downgrade write lock to read lock - */ -static inline void __downgrade_write(struct rw_semaphore *sem) -{ - long tmp; - - tmp = atomic64_add_return(-RWSEM_WAITING_BIAS, (atomic64_t *)(&sem->count)); - if (tmp < 0L) - rwsem_downgrade_wake(sem); -} - -/* - * implement exchange and add functionality - */ -static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem) -{ - return atomic64_add_return(delta, (atomic64_t *)(&sem->count)); -} - -#endif /* __KERNEL__ */ - -#endif /* _SPARC64_RWSEM_H */ diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index 6596f66ce112..a4b8b5aed21c 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -1756,7 +1756,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry, } } #endif - } while (entry->nr < PERF_MAX_STACK_DEPTH); + } while (entry->nr < sysctl_perf_event_max_stack); } static inline int @@ -1790,7 +1790,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry, pc = sf.callers_pc; ufp = (unsigned long)sf.fp + STACK_BIAS; perf_callchain_store(entry, pc); - } while (entry->nr < PERF_MAX_STACK_DEPTH); + } while (entry->nr < sysctl_perf_event_max_stack); } static void perf_callchain_user_32(struct perf_callchain_entry *entry, @@ -1822,7 +1822,7 @@ static void perf_callchain_user_32(struct 
perf_callchain_entry *entry, ufp = (unsigned long)sf.fp; } perf_callchain_store(entry, pc); - } while (entry->nr < PERF_MAX_STACK_DEPTH); + } while (entry->nr < sysctl_perf_event_max_stack); } void diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2dc18605831f..7bb15747fea2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -164,10 +164,6 @@ config INSTRUCTION_DECODER def_bool y depends on KPROBES || PERF_EVENTS || UPROBES -config PERF_EVENTS_INTEL_UNCORE - def_bool y - depends on PERF_EVENTS && CPU_SUP_INTEL && PCI - config OUTPUT_FORMAT string default "elf32-i386" if X86_32 @@ -1046,6 +1042,8 @@ config X86_THERMAL_VECTOR def_bool y depends on X86_MCE_INTEL +source "arch/x86/events/Kconfig" + config X86_LEGACY_VM86 bool "Legacy VM86 support" default n @@ -1210,15 +1208,6 @@ config MICROCODE_OLD_INTERFACE def_bool y depends on MICROCODE -config PERF_EVENTS_AMD_POWER - depends on PERF_EVENTS && CPU_SUP_AMD - tristate "AMD Processor Power Reporting Mechanism" - ---help--- - Provide power reporting mechanism support for AMD processors. - Currently, it leverages X86_FEATURE_ACC_POWER - (CPUID Fn8000_0007_EDX[12]) interface to calculate the - average power consumption on Family 15h processors. - config X86_MSR tristate "/dev/cpu/*/msr - Model-specific register support" ---help--- @@ -1932,54 +1921,38 @@ config RELOCATABLE (CONFIG_PHYSICAL_START) is used as the minimum location. config RANDOMIZE_BASE - bool "Randomize the address of the kernel image" + bool "Randomize the address of the kernel image (KASLR)" depends on RELOCATABLE default n ---help--- - Randomizes the physical and virtual address at which the - kernel image is decompressed, as a security feature that - deters exploit attempts relying on knowledge of the location - of kernel internals. + In support of Kernel Address Space Layout Randomization (KASLR), + this randomizes the physical address at which the kernel image + is decompressed and the virtual address where the kernel + image is mapped, as a security feature that deters exploit + attempts relying on knowledge of the location of kernel + code internals. + + The kernel physical and virtual address can be randomized + from 16MB up to 1GB on 64-bit and 512MB on 32-bit. (Note that + using RANDOMIZE_BASE reduces the memory space available to + kernel modules from 1.5GB to 1GB.) + + Entropy is generated using the RDRAND instruction if it is + supported. If RDTSC is supported, its value is mixed into + the entropy pool as well. If neither RDRAND nor RDTSC are + supported, then entropy is read from the i8254 timer. + + Since the kernel is built using 2GB addressing, and + PHYSICAL_ALIGN must be at a minimum of 2MB, only 10 bits of + entropy is theoretically possible. Currently, with the + default value for PHYSICAL_ALIGN and due to page table + layouts, 64-bit uses 9 bits of entropy and 32-bit uses 8 bits. + + If CONFIG_HIBERNATE is also enabled, KASLR is disabled at boot + time. To enable it, boot with "kaslr" on the kernel command + line (which will also disable hibernation). - Entropy is generated using the RDRAND instruction if it is - supported. If RDTSC is supported, it is used as well. If - neither RDRAND nor RDTSC are supported, then randomness is - read from the i8254 timer. - - The kernel will be offset by up to RANDOMIZE_BASE_MAX_OFFSET, - and aligned according to PHYSICAL_ALIGN. Since the kernel is - built using 2GiB addressing, and PHYSICAL_ALGIN must be at a - minimum of 2MiB, only 10 bits of entropy is theoretically - possible. 
At best, due to page table layouts, 64-bit can use - 9 bits of entropy and 32-bit uses 8 bits. - - If unsure, say N. - -config RANDOMIZE_BASE_MAX_OFFSET - hex "Maximum kASLR offset allowed" if EXPERT - depends on RANDOMIZE_BASE - range 0x0 0x20000000 if X86_32 - default "0x20000000" if X86_32 - range 0x0 0x40000000 if X86_64 - default "0x40000000" if X86_64 - ---help--- - The lesser of RANDOMIZE_BASE_MAX_OFFSET and available physical - memory is used to determine the maximal offset in bytes that will - be applied to the kernel when kernel Address Space Layout - Randomization (kASLR) is active. This must be a multiple of - PHYSICAL_ALIGN. - - On 32-bit this is limited to 512MiB by page table layouts. The - default is 512MiB. - - On 64-bit this is limited by how the kernel fixmap page table is - positioned, so this cannot be larger than 1GiB currently. Without - RANDOMIZE_BASE, there is a 512MiB to 1.5GiB split between kernel - and modules. When RANDOMIZE_BASE_MAX_OFFSET is above 512MiB, the - modules area will shrink to compensate, up to the current maximum - 1GiB to 1GiB split. The default is 1GiB. - - If unsure, leave at the default value. + If unsure, say N. # Relocation on x86 needs some additional build support config X86_NEED_RELOCS diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 4086abca0b32..6fce7f096b88 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -208,7 +208,8 @@ endif head-y := arch/x86/kernel/head_$(BITS).o head-y += arch/x86/kernel/head$(BITS).o -head-y += arch/x86/kernel/head.o +head-y += arch/x86/kernel/ebda.o +head-y += arch/x86/kernel/platform-quirks.o libs-y += arch/x86/lib/ diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index b1ef9e489084..700a9c6e6159 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -86,16 +86,7 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE SETUP_OBJS = $(addprefix $(obj)/,$(setup-y)) -sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|_end\)$$/\#define VO_\2 0x\1/p' - -quiet_cmd_voffset = VOFFSET $@ - cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@ - -targets += voffset.h -$(obj)/voffset.h: vmlinux FORCE - $(call if_changed,voffset) - -sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p' +sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|_ehead\|_text\|z_.*\)$$/\#define ZO_\2 0x\1/p' quiet_cmd_zoffset = ZOFFSET $@ cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@ @@ -106,7 +97,7 @@ $(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE AFLAGS_header.o += -I$(obj) -$(obj)/header.o: $(obj)/voffset.h $(obj)/zoffset.h +$(obj)/header.o: $(obj)/zoffset.h LDFLAGS_setup.elf := -T $(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 8774cb23064f..cfdd8c3f8af2 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -57,12 +57,27 @@ LDFLAGS_vmlinux := -T hostprogs-y := mkpiggy HOST_EXTRACFLAGS += -I$(srctree)/tools/include +sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p' + +quiet_cmd_voffset = VOFFSET $@ + cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@ + +targets += ../voffset.h + +$(obj)/../voffset.h: vmlinux FORCE + $(call if_changed,voffset) + +$(obj)/misc.o: $(obj)/../voffset.h 
+ vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \ - $(obj)/string.o $(obj)/cmdline.o \ + $(obj)/string.o $(obj)/cmdline.o $(obj)/error.o \ $(obj)/piggy.o $(obj)/cpuflags.o vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o -vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/aslr.o +vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o +ifdef CONFIG_X86_64 + vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/pagetable.o +endif $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone @@ -109,10 +124,8 @@ suffix-$(CONFIG_KERNEL_XZ) := xz suffix-$(CONFIG_KERNEL_LZO) := lzo suffix-$(CONFIG_KERNEL_LZ4) := lz4 -RUN_SIZE = $(shell $(OBJDUMP) -h vmlinux | \ - $(CONFIG_SHELL) $(srctree)/arch/x86/tools/calc_run_size.sh) quiet_cmd_mkpiggy = MKPIGGY $@ - cmd_mkpiggy = $(obj)/mkpiggy $< $(RUN_SIZE) > $@ || ( rm -f $@ ; false ) + cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false ) targets += piggy.S $(obj)/piggy.S: $(obj)/vmlinux.bin.$(suffix-y) $(obj)/mkpiggy FORCE diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c deleted file mode 100644 index 6a9b96b4624d..000000000000 --- a/arch/x86/boot/compressed/aslr.c +++ /dev/null @@ -1,339 +0,0 @@ -#include "misc.h" - -#include <asm/msr.h> -#include <asm/archrandom.h> -#include <asm/e820.h> - -#include <generated/compile.h> -#include <linux/module.h> -#include <linux/uts.h> -#include <linux/utsname.h> -#include <generated/utsrelease.h> - -/* Simplified build-specific string for starting entropy. */ -static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@" - LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION; - -#define I8254_PORT_CONTROL 0x43 -#define I8254_PORT_COUNTER0 0x40 -#define I8254_CMD_READBACK 0xC0 -#define I8254_SELECT_COUNTER0 0x02 -#define I8254_STATUS_NOTREADY 0x40 -static inline u16 i8254(void) -{ - u16 status, timer; - - do { - outb(I8254_PORT_CONTROL, - I8254_CMD_READBACK | I8254_SELECT_COUNTER0); - status = inb(I8254_PORT_COUNTER0); - timer = inb(I8254_PORT_COUNTER0); - timer |= inb(I8254_PORT_COUNTER0) << 8; - } while (status & I8254_STATUS_NOTREADY); - - return timer; -} - -static unsigned long rotate_xor(unsigned long hash, const void *area, - size_t size) -{ - size_t i; - unsigned long *ptr = (unsigned long *)area; - - for (i = 0; i < size / sizeof(hash); i++) { - /* Rotate by odd number of bits and XOR. */ - hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7); - hash ^= ptr[i]; - } - - return hash; -} - -/* Attempt to create a simple but unpredictable starting entropy. 
*/ -static unsigned long get_random_boot(void) -{ - unsigned long hash = 0; - - hash = rotate_xor(hash, build_str, sizeof(build_str)); - hash = rotate_xor(hash, real_mode, sizeof(*real_mode)); - - return hash; -} - -static unsigned long get_random_long(void) -{ -#ifdef CONFIG_X86_64 - const unsigned long mix_const = 0x5d6008cbf3848dd3UL; -#else - const unsigned long mix_const = 0x3f39e593UL; -#endif - unsigned long raw, random = get_random_boot(); - bool use_i8254 = true; - - debug_putstr("KASLR using"); - - if (has_cpuflag(X86_FEATURE_RDRAND)) { - debug_putstr(" RDRAND"); - if (rdrand_long(&raw)) { - random ^= raw; - use_i8254 = false; - } - } - - if (has_cpuflag(X86_FEATURE_TSC)) { - debug_putstr(" RDTSC"); - raw = rdtsc(); - - random ^= raw; - use_i8254 = false; - } - - if (use_i8254) { - debug_putstr(" i8254"); - random ^= i8254(); - } - - /* Circular multiply for better bit diffusion */ - asm("mul %3" - : "=a" (random), "=d" (raw) - : "a" (random), "rm" (mix_const)); - random += raw; - - debug_putstr("...\n"); - - return random; -} - -struct mem_vector { - unsigned long start; - unsigned long size; -}; - -#define MEM_AVOID_MAX 5 -static struct mem_vector mem_avoid[MEM_AVOID_MAX]; - -static bool mem_contains(struct mem_vector *region, struct mem_vector *item) -{ - /* Item at least partially before region. */ - if (item->start < region->start) - return false; - /* Item at least partially after region. */ - if (item->start + item->size > region->start + region->size) - return false; - return true; -} - -static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two) -{ - /* Item one is entirely before item two. */ - if (one->start + one->size <= two->start) - return false; - /* Item one is entirely after item two. */ - if (one->start >= two->start + two->size) - return false; - return true; -} - -static void mem_avoid_init(unsigned long input, unsigned long input_size, - unsigned long output, unsigned long output_size) -{ - u64 initrd_start, initrd_size; - u64 cmd_line, cmd_line_size; - unsigned long unsafe, unsafe_len; - char *ptr; - - /* - * Avoid the region that is unsafe to overlap during - * decompression (see calculations at top of misc.c). - */ - unsafe_len = (output_size >> 12) + 32768 + 18; - unsafe = (unsigned long)input + input_size - unsafe_len; - mem_avoid[0].start = unsafe; - mem_avoid[0].size = unsafe_len; - - /* Avoid initrd. */ - initrd_start = (u64)real_mode->ext_ramdisk_image << 32; - initrd_start |= real_mode->hdr.ramdisk_image; - initrd_size = (u64)real_mode->ext_ramdisk_size << 32; - initrd_size |= real_mode->hdr.ramdisk_size; - mem_avoid[1].start = initrd_start; - mem_avoid[1].size = initrd_size; - - /* Avoid kernel command line. */ - cmd_line = (u64)real_mode->ext_cmd_line_ptr << 32; - cmd_line |= real_mode->hdr.cmd_line_ptr; - /* Calculate size of cmd_line. */ - ptr = (char *)(unsigned long)cmd_line; - for (cmd_line_size = 0; ptr[cmd_line_size++]; ) - ; - mem_avoid[2].start = cmd_line; - mem_avoid[2].size = cmd_line_size; - - /* Avoid heap memory. */ - mem_avoid[3].start = (unsigned long)free_mem_ptr; - mem_avoid[3].size = BOOT_HEAP_SIZE; - - /* Avoid stack memory. */ - mem_avoid[4].start = (unsigned long)free_mem_end_ptr; - mem_avoid[4].size = BOOT_STACK_SIZE; -} - -/* Does this memory vector overlap a known avoided area? 
*/ -static bool mem_avoid_overlap(struct mem_vector *img) -{ - int i; - struct setup_data *ptr; - - for (i = 0; i < MEM_AVOID_MAX; i++) { - if (mem_overlaps(img, &mem_avoid[i])) - return true; - } - - /* Avoid all entries in the setup_data linked list. */ - ptr = (struct setup_data *)(unsigned long)real_mode->hdr.setup_data; - while (ptr) { - struct mem_vector avoid; - - avoid.start = (unsigned long)ptr; - avoid.size = sizeof(*ptr) + ptr->len; - - if (mem_overlaps(img, &avoid)) - return true; - - ptr = (struct setup_data *)(unsigned long)ptr->next; - } - - return false; -} - -static unsigned long slots[CONFIG_RANDOMIZE_BASE_MAX_OFFSET / - CONFIG_PHYSICAL_ALIGN]; -static unsigned long slot_max; - -static void slots_append(unsigned long addr) -{ - /* Overflowing the slots list should be impossible. */ - if (slot_max >= CONFIG_RANDOMIZE_BASE_MAX_OFFSET / - CONFIG_PHYSICAL_ALIGN) - return; - - slots[slot_max++] = addr; -} - -static unsigned long slots_fetch_random(void) -{ - /* Handle case of no slots stored. */ - if (slot_max == 0) - return 0; - - return slots[get_random_long() % slot_max]; -} - -static void process_e820_entry(struct e820entry *entry, - unsigned long minimum, - unsigned long image_size) -{ - struct mem_vector region, img; - - /* Skip non-RAM entries. */ - if (entry->type != E820_RAM) - return; - - /* Ignore entries entirely above our maximum. */ - if (entry->addr >= CONFIG_RANDOMIZE_BASE_MAX_OFFSET) - return; - - /* Ignore entries entirely below our minimum. */ - if (entry->addr + entry->size < minimum) - return; - - region.start = entry->addr; - region.size = entry->size; - - /* Potentially raise address to minimum location. */ - if (region.start < minimum) - region.start = minimum; - - /* Potentially raise address to meet alignment requirements. */ - region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN); - - /* Did we raise the address above the bounds of this e820 region? */ - if (region.start > entry->addr + entry->size) - return; - - /* Reduce size by any delta from the original address. */ - region.size -= region.start - entry->addr; - - /* Reduce maximum size to fit end of image within maximum limit. */ - if (region.start + region.size > CONFIG_RANDOMIZE_BASE_MAX_OFFSET) - region.size = CONFIG_RANDOMIZE_BASE_MAX_OFFSET - region.start; - - /* Walk each aligned slot and check for avoided areas. */ - for (img.start = region.start, img.size = image_size ; - mem_contains(®ion, &img) ; - img.start += CONFIG_PHYSICAL_ALIGN) { - if (mem_avoid_overlap(&img)) - continue; - slots_append(img.start); - } -} - -static unsigned long find_random_addr(unsigned long minimum, - unsigned long size) -{ - int i; - unsigned long addr; - - /* Make sure minimum is aligned. */ - minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); - - /* Verify potential e820 positions, appending to slots list. 
*/ - for (i = 0; i < real_mode->e820_entries; i++) { - process_e820_entry(&real_mode->e820_map[i], minimum, size); - } - - return slots_fetch_random(); -} - -unsigned char *choose_kernel_location(struct boot_params *boot_params, - unsigned char *input, - unsigned long input_size, - unsigned char *output, - unsigned long output_size) -{ - unsigned long choice = (unsigned long)output; - unsigned long random; - -#ifdef CONFIG_HIBERNATION - if (!cmdline_find_option_bool("kaslr")) { - debug_putstr("KASLR disabled by default...\n"); - goto out; - } -#else - if (cmdline_find_option_bool("nokaslr")) { - debug_putstr("KASLR disabled by cmdline...\n"); - goto out; - } -#endif - - boot_params->hdr.loadflags |= KASLR_FLAG; - - /* Record the various known unsafe memory ranges. */ - mem_avoid_init((unsigned long)input, input_size, - (unsigned long)output, output_size); - - /* Walk e820 and find a random address. */ - random = find_random_addr(choice, output_size); - if (!random) { - debug_putstr("KASLR could not find suitable E820 region...\n"); - goto out; - } - - /* Always enforce the minimum. */ - if (random < choice) - goto out; - - choice = random; -out: - return (unsigned char *)choice; -} diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c index b68e3033e6b9..73ccf63b0f48 100644 --- a/arch/x86/boot/compressed/cmdline.c +++ b/arch/x86/boot/compressed/cmdline.c @@ -15,9 +15,9 @@ static inline char rdfs8(addr_t addr) #include "../cmdline.c" static unsigned long get_cmd_line_ptr(void) { - unsigned long cmd_line_ptr = real_mode->hdr.cmd_line_ptr; + unsigned long cmd_line_ptr = boot_params->hdr.cmd_line_ptr; - cmd_line_ptr |= (u64)real_mode->ext_cmd_line_ptr << 32; + cmd_line_ptr |= (u64)boot_params->ext_cmd_line_ptr << 32; return cmd_line_ptr; } diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 583d539a4197..52fef606bc54 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -571,312 +571,6 @@ free_handle: efi_call_early(free_pool, pci_handle); } -static void -setup_pixel_info(struct screen_info *si, u32 pixels_per_scan_line, - struct efi_pixel_bitmask pixel_info, int pixel_format) -{ - if (pixel_format == PIXEL_RGB_RESERVED_8BIT_PER_COLOR) { - si->lfb_depth = 32; - si->lfb_linelength = pixels_per_scan_line * 4; - si->red_size = 8; - si->red_pos = 0; - si->green_size = 8; - si->green_pos = 8; - si->blue_size = 8; - si->blue_pos = 16; - si->rsvd_size = 8; - si->rsvd_pos = 24; - } else if (pixel_format == PIXEL_BGR_RESERVED_8BIT_PER_COLOR) { - si->lfb_depth = 32; - si->lfb_linelength = pixels_per_scan_line * 4; - si->red_size = 8; - si->red_pos = 16; - si->green_size = 8; - si->green_pos = 8; - si->blue_size = 8; - si->blue_pos = 0; - si->rsvd_size = 8; - si->rsvd_pos = 24; - } else if (pixel_format == PIXEL_BIT_MASK) { - find_bits(pixel_info.red_mask, &si->red_pos, &si->red_size); - find_bits(pixel_info.green_mask, &si->green_pos, - &si->green_size); - find_bits(pixel_info.blue_mask, &si->blue_pos, &si->blue_size); - find_bits(pixel_info.reserved_mask, &si->rsvd_pos, - &si->rsvd_size); - si->lfb_depth = si->red_size + si->green_size + - si->blue_size + si->rsvd_size; - si->lfb_linelength = (pixels_per_scan_line * si->lfb_depth) / 8; - } else { - si->lfb_depth = 4; - si->lfb_linelength = si->lfb_width / 2; - si->red_size = 0; - si->red_pos = 0; - si->green_size = 0; - si->green_pos = 0; - si->blue_size = 0; - si->blue_pos = 0; - si->rsvd_size = 0; - si->rsvd_pos = 0; - } -} - -static efi_status_t 
-__gop_query32(struct efi_graphics_output_protocol_32 *gop32, - struct efi_graphics_output_mode_info **info, - unsigned long *size, u64 *fb_base) -{ - struct efi_graphics_output_protocol_mode_32 *mode; - efi_status_t status; - unsigned long m; - - m = gop32->mode; - mode = (struct efi_graphics_output_protocol_mode_32 *)m; - - status = efi_early->call(gop32->query_mode, gop32, - mode->mode, size, info); - if (status != EFI_SUCCESS) - return status; - - *fb_base = mode->frame_buffer_base; - return status; -} - -static efi_status_t -setup_gop32(struct screen_info *si, efi_guid_t *proto, - unsigned long size, void **gop_handle) -{ - struct efi_graphics_output_protocol_32 *gop32, *first_gop; - unsigned long nr_gops; - u16 width, height; - u32 pixels_per_scan_line; - u32 ext_lfb_base; - u64 fb_base; - struct efi_pixel_bitmask pixel_info; - int pixel_format; - efi_status_t status; - u32 *handles = (u32 *)(unsigned long)gop_handle; - int i; - - first_gop = NULL; - gop32 = NULL; - - nr_gops = size / sizeof(u32); - for (i = 0; i < nr_gops; i++) { - struct efi_graphics_output_mode_info *info = NULL; - efi_guid_t conout_proto = EFI_CONSOLE_OUT_DEVICE_GUID; - bool conout_found = false; - void *dummy = NULL; - u32 h = handles[i]; - u64 current_fb_base; - - status = efi_call_early(handle_protocol, h, - proto, (void **)&gop32); - if (status != EFI_SUCCESS) - continue; - - status = efi_call_early(handle_protocol, h, - &conout_proto, &dummy); - if (status == EFI_SUCCESS) - conout_found = true; - - status = __gop_query32(gop32, &info, &size, ¤t_fb_base); - if (status == EFI_SUCCESS && (!first_gop || conout_found)) { - /* - * Systems that use the UEFI Console Splitter may - * provide multiple GOP devices, not all of which are - * backed by real hardware. The workaround is to search - * for a GOP implementing the ConOut protocol, and if - * one isn't found, to just fall back to the first GOP. - */ - width = info->horizontal_resolution; - height = info->vertical_resolution; - pixel_format = info->pixel_format; - pixel_info = info->pixel_information; - pixels_per_scan_line = info->pixels_per_scan_line; - fb_base = current_fb_base; - - /* - * Once we've found a GOP supporting ConOut, - * don't bother looking any further. - */ - first_gop = gop32; - if (conout_found) - break; - } - } - - /* Did we find any GOPs? 
*/ - if (!first_gop) - goto out; - - /* EFI framebuffer */ - si->orig_video_isVGA = VIDEO_TYPE_EFI; - - si->lfb_width = width; - si->lfb_height = height; - si->lfb_base = fb_base; - - ext_lfb_base = (u64)(unsigned long)fb_base >> 32; - if (ext_lfb_base) { - si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE; - si->ext_lfb_base = ext_lfb_base; - } - - si->pages = 1; - - setup_pixel_info(si, pixels_per_scan_line, pixel_info, pixel_format); - - si->lfb_size = si->lfb_linelength * si->lfb_height; - - si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS; -out: - return status; -} - -static efi_status_t -__gop_query64(struct efi_graphics_output_protocol_64 *gop64, - struct efi_graphics_output_mode_info **info, - unsigned long *size, u64 *fb_base) -{ - struct efi_graphics_output_protocol_mode_64 *mode; - efi_status_t status; - unsigned long m; - - m = gop64->mode; - mode = (struct efi_graphics_output_protocol_mode_64 *)m; - - status = efi_early->call(gop64->query_mode, gop64, - mode->mode, size, info); - if (status != EFI_SUCCESS) - return status; - - *fb_base = mode->frame_buffer_base; - return status; -} - -static efi_status_t -setup_gop64(struct screen_info *si, efi_guid_t *proto, - unsigned long size, void **gop_handle) -{ - struct efi_graphics_output_protocol_64 *gop64, *first_gop; - unsigned long nr_gops; - u16 width, height; - u32 pixels_per_scan_line; - u32 ext_lfb_base; - u64 fb_base; - struct efi_pixel_bitmask pixel_info; - int pixel_format; - efi_status_t status; - u64 *handles = (u64 *)(unsigned long)gop_handle; - int i; - - first_gop = NULL; - gop64 = NULL; - - nr_gops = size / sizeof(u64); - for (i = 0; i < nr_gops; i++) { - struct efi_graphics_output_mode_info *info = NULL; - efi_guid_t conout_proto = EFI_CONSOLE_OUT_DEVICE_GUID; - bool conout_found = false; - void *dummy = NULL; - u64 h = handles[i]; - u64 current_fb_base; - - status = efi_call_early(handle_protocol, h, - proto, (void **)&gop64); - if (status != EFI_SUCCESS) - continue; - - status = efi_call_early(handle_protocol, h, - &conout_proto, &dummy); - if (status == EFI_SUCCESS) - conout_found = true; - - status = __gop_query64(gop64, &info, &size, ¤t_fb_base); - if (status == EFI_SUCCESS && (!first_gop || conout_found)) { - /* - * Systems that use the UEFI Console Splitter may - * provide multiple GOP devices, not all of which are - * backed by real hardware. The workaround is to search - * for a GOP implementing the ConOut protocol, and if - * one isn't found, to just fall back to the first GOP. - */ - width = info->horizontal_resolution; - height = info->vertical_resolution; - pixel_format = info->pixel_format; - pixel_info = info->pixel_information; - pixels_per_scan_line = info->pixels_per_scan_line; - fb_base = current_fb_base; - - /* - * Once we've found a GOP supporting ConOut, - * don't bother looking any further. - */ - first_gop = gop64; - if (conout_found) - break; - } - } - - /* Did we find any GOPs? 
*/ - if (!first_gop) - goto out; - - /* EFI framebuffer */ - si->orig_video_isVGA = VIDEO_TYPE_EFI; - - si->lfb_width = width; - si->lfb_height = height; - si->lfb_base = fb_base; - - ext_lfb_base = (u64)(unsigned long)fb_base >> 32; - if (ext_lfb_base) { - si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE; - si->ext_lfb_base = ext_lfb_base; - } - - si->pages = 1; - - setup_pixel_info(si, pixels_per_scan_line, pixel_info, pixel_format); - - si->lfb_size = si->lfb_linelength * si->lfb_height; - - si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS; -out: - return status; -} - -/* - * See if we have Graphics Output Protocol - */ -static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto, - unsigned long size) -{ - efi_status_t status; - void **gop_handle = NULL; - - status = efi_call_early(allocate_pool, EFI_LOADER_DATA, - size, (void **)&gop_handle); - if (status != EFI_SUCCESS) - return status; - - status = efi_call_early(locate_handle, - EFI_LOCATE_BY_PROTOCOL, - proto, NULL, &size, gop_handle); - if (status != EFI_SUCCESS) - goto free_handle; - - if (efi_early->is64) - status = setup_gop64(si, proto, size, gop_handle); - else - status = setup_gop32(si, proto, size, gop_handle); - -free_handle: - efi_call_early(free_pool, gop_handle); - return status; -} - static efi_status_t setup_uga32(void **uga_handle, unsigned long size, u32 *width, u32 *height) { @@ -1038,7 +732,7 @@ void setup_graphics(struct boot_params *boot_params) EFI_LOCATE_BY_PROTOCOL, &graphics_proto, NULL, &size, gop_handle); if (status == EFI_BUFFER_TOO_SMALL) - status = setup_gop(si, &graphics_proto, size); + status = efi_setup_gop(NULL, si, &graphics_proto, size); if (status != EFI_SUCCESS) { size = 0; diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h index d487e727f1ec..c0223f1a89d7 100644 --- a/arch/x86/boot/compressed/eboot.h +++ b/arch/x86/boot/compressed/eboot.h @@ -11,80 +11,6 @@ #define DESC_TYPE_CODE_DATA (1 << 0) -#define EFI_CONSOLE_OUT_DEVICE_GUID \ - EFI_GUID(0xd3b36f2c, 0xd551, 0x11d4, 0x9a, 0x46, 0x0, 0x90, 0x27, \ - 0x3f, 0xc1, 0x4d) - -#define PIXEL_RGB_RESERVED_8BIT_PER_COLOR 0 -#define PIXEL_BGR_RESERVED_8BIT_PER_COLOR 1 -#define PIXEL_BIT_MASK 2 -#define PIXEL_BLT_ONLY 3 -#define PIXEL_FORMAT_MAX 4 - -struct efi_pixel_bitmask { - u32 red_mask; - u32 green_mask; - u32 blue_mask; - u32 reserved_mask; -}; - -struct efi_graphics_output_mode_info { - u32 version; - u32 horizontal_resolution; - u32 vertical_resolution; - int pixel_format; - struct efi_pixel_bitmask pixel_information; - u32 pixels_per_scan_line; -} __packed; - -struct efi_graphics_output_protocol_mode_32 { - u32 max_mode; - u32 mode; - u32 info; - u32 size_of_info; - u64 frame_buffer_base; - u32 frame_buffer_size; -} __packed; - -struct efi_graphics_output_protocol_mode_64 { - u32 max_mode; - u32 mode; - u64 info; - u64 size_of_info; - u64 frame_buffer_base; - u64 frame_buffer_size; -} __packed; - -struct efi_graphics_output_protocol_mode { - u32 max_mode; - u32 mode; - unsigned long info; - unsigned long size_of_info; - u64 frame_buffer_base; - unsigned long frame_buffer_size; -} __packed; - -struct efi_graphics_output_protocol_32 { - u32 query_mode; - u32 set_mode; - u32 blt; - u32 mode; -}; - -struct efi_graphics_output_protocol_64 { - u64 query_mode; - u64 set_mode; - u64 blt; - u64 mode; -}; - -struct efi_graphics_output_protocol { - void *query_mode; - unsigned long set_mode; - unsigned long blt; - struct efi_graphics_output_protocol_mode *mode; -}; - struct efi_uga_draw_protocol_32 { u32 
get_mode; u32 set_mode; diff --git a/arch/x86/boot/compressed/error.c b/arch/x86/boot/compressed/error.c new file mode 100644 index 000000000000..6248740b68b5 --- /dev/null +++ b/arch/x86/boot/compressed/error.c @@ -0,0 +1,22 @@ +/* + * Callers outside of misc.c need access to the error reporting routines, + * but the *_putstr() functions need to stay in misc.c because of how + * memcpy() and memmove() are defined for the compressed boot environment. + */ +#include "misc.h" + +void warn(char *m) +{ + error_putstr("\n\n"); + error_putstr(m); + error_putstr("\n\n"); +} + +void error(char *m) +{ + warn(m); + error_putstr(" -- System halted"); + + while (1) + asm("hlt"); +} diff --git a/arch/x86/boot/compressed/error.h b/arch/x86/boot/compressed/error.h new file mode 100644 index 000000000000..2e59dac07f9e --- /dev/null +++ b/arch/x86/boot/compressed/error.h @@ -0,0 +1,7 @@ +#ifndef BOOT_COMPRESSED_ERROR_H +#define BOOT_COMPRESSED_ERROR_H + +void warn(char *m); +void error(char *m); + +#endif /* BOOT_COMPRESSED_ERROR_H */ diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 0256064da8da..1038524270e7 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -176,7 +176,9 @@ preferred_addr: 1: /* Target address to relocate to for decompression */ - addl $z_extract_offset, %ebx + movl BP_init_size(%esi), %eax + subl $_end, %eax + addl %eax, %ebx /* Set up the stack */ leal boot_stack_end(%ebx), %esp @@ -233,24 +235,28 @@ relocated: 2: /* - * Do the decompression, and jump to the new kernel.. + * Do the extraction, and jump to the new kernel.. */ - /* push arguments for decompress_kernel: */ - pushl $z_run_size /* size of kernel with .bss and .brk */ + /* push arguments for extract_kernel: */ pushl $z_output_len /* decompressed length, end of relocs */ - leal z_extract_offset_negative(%ebx), %ebp + + movl BP_init_size(%esi), %eax + subl $_end, %eax + movl %ebx, %ebp + subl %eax, %ebp pushl %ebp /* output address */ + pushl $z_input_len /* input_len */ leal input_data(%ebx), %eax pushl %eax /* input_data */ leal boot_heap(%ebx), %eax pushl %eax /* heap area */ pushl %esi /* real mode pointer */ - call decompress_kernel /* returns kernel location in %eax */ - addl $28, %esp + call extract_kernel /* returns kernel location in %eax */ + addl $24, %esp /* - * Jump to the decompressed kernel. + * Jump to the extracted kernel. */ xorl %ebx, %ebx jmp *%eax diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 86558a199139..0d80a7ad65cd 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -110,7 +110,9 @@ ENTRY(startup_32) 1: /* Target address to relocate to for decompression */ - addl $z_extract_offset, %ebx + movl BP_init_size(%esi), %eax + subl $_end, %eax + addl %eax, %ebx /* * Prepare for entering 64 bit mode @@ -132,7 +134,7 @@ ENTRY(startup_32) /* Initialize Page tables to 0 */ leal pgtable(%ebx), %edi xorl %eax, %eax - movl $((4096*6)/4), %ecx + movl $(BOOT_INIT_PGT_SIZE/4), %ecx rep stosl /* Build Level 4 */ @@ -338,7 +340,9 @@ preferred_addr: 1: /* Target address to relocate to for decompression */ - leaq z_extract_offset(%rbp), %rbx + movl BP_init_size(%rsi), %ebx + subl $_end, %ebx + addq %rbp, %rbx /* Set up the stack */ leaq boot_stack_end(%rbx), %rsp @@ -408,19 +412,16 @@ relocated: 2: /* - * Do the decompression, and jump to the new kernel.. + * Do the extraction, and jump to the new kernel.. 
*/ pushq %rsi /* Save the real mode argument */ - movq $z_run_size, %r9 /* size of kernel with .bss and .brk */ - pushq %r9 movq %rsi, %rdi /* real mode address */ leaq boot_heap(%rip), %rsi /* malloc area for uncompression */ leaq input_data(%rip), %rdx /* input_data */ movl $z_input_len, %ecx /* input_len */ movq %rbp, %r8 /* output target address */ movq $z_output_len, %r9 /* decompressed length, end of relocs */ - call decompress_kernel /* returns kernel location in %rax */ - popq %r9 + call extract_kernel /* returns kernel location in %rax */ popq %rsi /* @@ -485,4 +486,4 @@ boot_stack_end: .section ".pgtable","a",@nobits .balign 4096 pgtable: - .fill 6*4096, 1, 0 + .fill BOOT_PGT_SIZE, 1, 0 diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c new file mode 100644 index 000000000000..cfeb0259ed81 --- /dev/null +++ b/arch/x86/boot/compressed/kaslr.c @@ -0,0 +1,510 @@ +/* + * kaslr.c + * + * This contains the routines needed to generate a reasonable level of + * entropy to choose a randomized kernel base address offset in support + * of Kernel Address Space Layout Randomization (KASLR). Additionally + * handles walking the physical memory maps (and tracking memory regions + * to avoid) in order to select a physical memory location that can + * contain the entire properly aligned running kernel image. + * + */ +#include "misc.h" +#include "error.h" + +#include <asm/msr.h> +#include <asm/archrandom.h> +#include <asm/e820.h> + +#include <generated/compile.h> +#include <linux/module.h> +#include <linux/uts.h> +#include <linux/utsname.h> +#include <generated/utsrelease.h> + +/* Simplified build-specific string for starting entropy. */ +static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@" + LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION; + +#define I8254_PORT_CONTROL 0x43 +#define I8254_PORT_COUNTER0 0x40 +#define I8254_CMD_READBACK 0xC0 +#define I8254_SELECT_COUNTER0 0x02 +#define I8254_STATUS_NOTREADY 0x40 +static inline u16 i8254(void) +{ + u16 status, timer; + + do { + outb(I8254_PORT_CONTROL, + I8254_CMD_READBACK | I8254_SELECT_COUNTER0); + status = inb(I8254_PORT_COUNTER0); + timer = inb(I8254_PORT_COUNTER0); + timer |= inb(I8254_PORT_COUNTER0) << 8; + } while (status & I8254_STATUS_NOTREADY); + + return timer; +} + +static unsigned long rotate_xor(unsigned long hash, const void *area, + size_t size) +{ + size_t i; + unsigned long *ptr = (unsigned long *)area; + + for (i = 0; i < size / sizeof(hash); i++) { + /* Rotate by odd number of bits and XOR. */ + hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7); + hash ^= ptr[i]; + } + + return hash; +} + +/* Attempt to create a simple but unpredictable starting entropy. 
*/ +static unsigned long get_random_boot(void) +{ + unsigned long hash = 0; + + hash = rotate_xor(hash, build_str, sizeof(build_str)); + hash = rotate_xor(hash, boot_params, sizeof(*boot_params)); + + return hash; +} + +static unsigned long get_random_long(const char *purpose) +{ +#ifdef CONFIG_X86_64 + const unsigned long mix_const = 0x5d6008cbf3848dd3UL; +#else + const unsigned long mix_const = 0x3f39e593UL; +#endif + unsigned long raw, random = get_random_boot(); + bool use_i8254 = true; + + debug_putstr(purpose); + debug_putstr(" KASLR using"); + + if (has_cpuflag(X86_FEATURE_RDRAND)) { + debug_putstr(" RDRAND"); + if (rdrand_long(&raw)) { + random ^= raw; + use_i8254 = false; + } + } + + if (has_cpuflag(X86_FEATURE_TSC)) { + debug_putstr(" RDTSC"); + raw = rdtsc(); + + random ^= raw; + use_i8254 = false; + } + + if (use_i8254) { + debug_putstr(" i8254"); + random ^= i8254(); + } + + /* Circular multiply for better bit diffusion */ + asm("mul %3" + : "=a" (random), "=d" (raw) + : "a" (random), "rm" (mix_const)); + random += raw; + + debug_putstr("...\n"); + + return random; +} + +struct mem_vector { + unsigned long start; + unsigned long size; +}; + +enum mem_avoid_index { + MEM_AVOID_ZO_RANGE = 0, + MEM_AVOID_INITRD, + MEM_AVOID_CMDLINE, + MEM_AVOID_BOOTPARAMS, + MEM_AVOID_MAX, +}; + +static struct mem_vector mem_avoid[MEM_AVOID_MAX]; + +static bool mem_contains(struct mem_vector *region, struct mem_vector *item) +{ + /* Item at least partially before region. */ + if (item->start < region->start) + return false; + /* Item at least partially after region. */ + if (item->start + item->size > region->start + region->size) + return false; + return true; +} + +static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two) +{ + /* Item one is entirely before item two. */ + if (one->start + one->size <= two->start) + return false; + /* Item one is entirely after item two. */ + if (one->start >= two->start + two->size) + return false; + return true; +} + +/* + * In theory, KASLR can put the kernel anywhere in the range of [16M, 64T). + * The mem_avoid array is used to store the ranges that need to be avoided + * when KASLR searches for an appropriate random address. We must avoid any + * regions that are unsafe to overlap with during decompression, and other + * things like the initrd, cmdline and boot_params. This comment seeks to + * explain mem_avoid as clearly as possible since incorrect mem_avoid + * memory ranges lead to really hard to debug boot failures. + * + * The initrd, cmdline, and boot_params are trivial to identify for + * avoiding. They are MEM_AVOID_INITRD, MEM_AVOID_CMDLINE, and + * MEM_AVOID_BOOTPARAMS respectively below. + * + * What is not obvious how to avoid is the range of memory that is used + * during decompression (MEM_AVOID_ZO_RANGE below). This range must cover + * the compressed kernel (ZO) and its run space, which is used to extract + * the uncompressed kernel (VO) and relocs. + * + * ZO's full run size sits against the end of the decompression buffer, so + * we can calculate where text, data, bss, etc of ZO are positioned more + * easily. + * + * For additional background, the decompression calculations can be found + * in header.S, and the memory diagram is based on the one found in misc.c. 
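As an aside, the interval checks above (mem_contains() and mem_overlaps()) are what the slot walk later relies on to reject candidate positions. The stand-alone sketch below is an illustration only -- the ranges are made up and the real code runs in the boot stub without libc -- showing a candidate being rejected because it touches a hypothetical initrd:

#include <stdio.h>
#include <stdbool.h>

struct range { unsigned long start, size; };

/* Same overlap test as mem_overlaps() in kaslr.c, on [start, start+size) ranges. */
static bool overlaps(const struct range *one, const struct range *two)
{
	if (one->start + one->size <= two->start)
		return false;			/* one entirely before two */
	if (one->start >= two->start + two->size)
		return false;			/* one entirely after two */
	return true;
}

int main(void)
{
	struct range initrd = { 64UL << 20,  8UL << 20 };	/* hypothetical initrd at 64M, 8M long */
	struct range img    = { 56UL << 20, 16UL << 20 };	/* candidate image at 56M, 16M long */

	printf("candidate %s\n", overlaps(&img, &initrd) ?
	       "rejected: overlaps initrd" : "accepted");
	return 0;
}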
+ * + * The following conditions are already enforced by the image layouts and + * associated code: + * - input + input_size >= output + output_size + * - kernel_total_size <= init_size + * - kernel_total_size <= output_size (see Note below) + * - output + init_size >= output + output_size + * + * (Note that kernel_total_size and output_size have no fundamental + * relationship, but output_size is passed to choose_random_location + * as a maximum of the two. The diagram is showing a case where + * kernel_total_size is larger than output_size, but this case is + * handled by bumping output_size.) + * + * The above conditions can be illustrated by a diagram: + * + * 0 output input input+input_size output+init_size + * | | | | | + * | | | | | + * |-----|--------|--------|--------------|-----------|--|-------------| + * | | | + * | | | + * output+init_size-ZO_INIT_SIZE output+output_size output+kernel_total_size + * + * [output, output+init_size) is the entire memory range used for + * extracting the compressed image. + * + * [output, output+kernel_total_size) is the range needed for the + * uncompressed kernel (VO) and its run size (bss, brk, etc). + * + * [output, output+output_size) is VO plus relocs (i.e. the entire + * uncompressed payload contained by ZO). This is the area of the buffer + * written to during decompression. + * + * [output+init_size-ZO_INIT_SIZE, output+init_size) is the worst-case + * range of the copied ZO and decompression code. (i.e. the range + * covered backwards of size ZO_INIT_SIZE, starting from output+init_size.) + * + * [input, input+input_size) is the original copied compressed image (ZO) + * (i.e. it does not include its run size). This range must be avoided + * because it contains the data used for decompression. + * + * [input+input_size, output+init_size) is [_text, _end) for ZO. This + * range includes ZO's heap and stack, and must be avoided since it + * performs the decompression. + * + * Since the above two ranges need to be avoided and they are adjacent, + * they can be merged, resulting in: [input, output+init_size) which + * becomes the MEM_AVOID_ZO_RANGE below. + */ +static void mem_avoid_init(unsigned long input, unsigned long input_size, + unsigned long output) +{ + unsigned long init_size = boot_params->hdr.init_size; + u64 initrd_start, initrd_size; + u64 cmd_line, cmd_line_size; + char *ptr; + + /* + * Avoid the region that is unsafe to overlap during + * decompression. + */ + mem_avoid[MEM_AVOID_ZO_RANGE].start = input; + mem_avoid[MEM_AVOID_ZO_RANGE].size = (output + init_size) - input; + add_identity_map(mem_avoid[MEM_AVOID_ZO_RANGE].start, + mem_avoid[MEM_AVOID_ZO_RANGE].size); + + /* Avoid initrd. */ + initrd_start = (u64)boot_params->ext_ramdisk_image << 32; + initrd_start |= boot_params->hdr.ramdisk_image; + initrd_size = (u64)boot_params->ext_ramdisk_size << 32; + initrd_size |= boot_params->hdr.ramdisk_size; + mem_avoid[MEM_AVOID_INITRD].start = initrd_start; + mem_avoid[MEM_AVOID_INITRD].size = initrd_size; + /* No need to set mapping for initrd, it will be handled in VO. */ + + /* Avoid kernel command line. */ + cmd_line = (u64)boot_params->ext_cmd_line_ptr << 32; + cmd_line |= boot_params->hdr.cmd_line_ptr; + /* Calculate size of cmd_line. 
*/ + ptr = (char *)(unsigned long)cmd_line; + for (cmd_line_size = 0; ptr[cmd_line_size++]; ) + ; + mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line; + mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size; + add_identity_map(mem_avoid[MEM_AVOID_CMDLINE].start, + mem_avoid[MEM_AVOID_CMDLINE].size); + + /* Avoid boot parameters. */ + mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params; + mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params); + add_identity_map(mem_avoid[MEM_AVOID_BOOTPARAMS].start, + mem_avoid[MEM_AVOID_BOOTPARAMS].size); + + /* We don't need to set a mapping for setup_data. */ + +#ifdef CONFIG_X86_VERBOSE_BOOTUP + /* Make sure video RAM can be used. */ + add_identity_map(0, PMD_SIZE); +#endif +} + +/* + * Does this memory vector overlap a known avoided area? If so, record the + * overlap region with the lowest address. + */ +static bool mem_avoid_overlap(struct mem_vector *img, + struct mem_vector *overlap) +{ + int i; + struct setup_data *ptr; + unsigned long earliest = img->start + img->size; + bool is_overlapping = false; + + for (i = 0; i < MEM_AVOID_MAX; i++) { + if (mem_overlaps(img, &mem_avoid[i]) && + mem_avoid[i].start < earliest) { + *overlap = mem_avoid[i]; + is_overlapping = true; + } + } + + /* Avoid all entries in the setup_data linked list. */ + ptr = (struct setup_data *)(unsigned long)boot_params->hdr.setup_data; + while (ptr) { + struct mem_vector avoid; + + avoid.start = (unsigned long)ptr; + avoid.size = sizeof(*ptr) + ptr->len; + + if (mem_overlaps(img, &avoid) && (avoid.start < earliest)) { + *overlap = avoid; + is_overlapping = true; + } + + ptr = (struct setup_data *)(unsigned long)ptr->next; + } + + return is_overlapping; +} + +static unsigned long slots[KERNEL_IMAGE_SIZE / CONFIG_PHYSICAL_ALIGN]; + +struct slot_area { + unsigned long addr; + int num; +}; + +#define MAX_SLOT_AREA 100 + +static struct slot_area slot_areas[MAX_SLOT_AREA]; + +static unsigned long slot_max; + +static unsigned long slot_area_index; + +static void store_slot_info(struct mem_vector *region, unsigned long image_size) +{ + struct slot_area slot_area; + + if (slot_area_index == MAX_SLOT_AREA) + return; + + slot_area.addr = region->start; + slot_area.num = (region->size - image_size) / + CONFIG_PHYSICAL_ALIGN + 1; + + if (slot_area.num > 0) { + slot_areas[slot_area_index++] = slot_area; + slot_max += slot_area.num; + } +} + +static void slots_append(unsigned long addr) +{ + /* Overflowing the slots list should be impossible. */ + if (slot_max >= KERNEL_IMAGE_SIZE / CONFIG_PHYSICAL_ALIGN) + return; + + slots[slot_max++] = addr; +} + +static unsigned long slots_fetch_random(void) +{ + /* Handle case of no slots stored. */ + if (slot_max == 0) + return 0; + + return slots[get_random_long("Physical") % slot_max]; +} + +static void process_e820_entry(struct e820entry *entry, + unsigned long minimum, + unsigned long image_size) +{ + struct mem_vector region, img, overlap; + + /* Skip non-RAM entries. */ + if (entry->type != E820_RAM) + return; + + /* Ignore entries entirely above our maximum. */ + if (entry->addr >= KERNEL_IMAGE_SIZE) + return; + + /* Ignore entries entirely below our minimum. */ + if (entry->addr + entry->size < minimum) + return; + + region.start = entry->addr; + region.size = entry->size; + + /* Potentially raise address to minimum location. */ + if (region.start < minimum) + region.start = minimum; + + /* Potentially raise address to meet alignment requirements. 
*/ + region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN); + + /* Did we raise the address above the bounds of this e820 region? */ + if (region.start > entry->addr + entry->size) + return; + + /* Reduce size by any delta from the original address. */ + region.size -= region.start - entry->addr; + + /* Reduce maximum size to fit end of image within maximum limit. */ + if (region.start + region.size > KERNEL_IMAGE_SIZE) + region.size = KERNEL_IMAGE_SIZE - region.start; + + /* Walk each aligned slot and check for avoided areas. */ + for (img.start = region.start, img.size = image_size ; + mem_contains(®ion, &img) ; + img.start += CONFIG_PHYSICAL_ALIGN) { + if (mem_avoid_overlap(&img, &overlap)) + continue; + slots_append(img.start); + } +} + +static unsigned long find_random_phys_addr(unsigned long minimum, + unsigned long image_size) +{ + int i; + unsigned long addr; + + /* Make sure minimum is aligned. */ + minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); + + /* Verify potential e820 positions, appending to slots list. */ + for (i = 0; i < boot_params->e820_entries; i++) { + process_e820_entry(&boot_params->e820_map[i], minimum, + image_size); + } + + return slots_fetch_random(); +} + +static unsigned long find_random_virt_addr(unsigned long minimum, + unsigned long image_size) +{ + unsigned long slots, random_addr; + + /* Make sure minimum is aligned. */ + minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); + /* Align image_size for easy slot calculations. */ + image_size = ALIGN(image_size, CONFIG_PHYSICAL_ALIGN); + + /* + * There are how many CONFIG_PHYSICAL_ALIGN-sized slots + * that can hold image_size within the range of minimum to + * KERNEL_IMAGE_SIZE? + */ + slots = (KERNEL_IMAGE_SIZE - minimum - image_size) / + CONFIG_PHYSICAL_ALIGN + 1; + + random_addr = get_random_long("Virtual") % slots; + + return random_addr * CONFIG_PHYSICAL_ALIGN + minimum; +} + +/* + * Since this function examines addresses much more numerically, + * it takes the input and output pointers as 'unsigned long'. + */ +unsigned char *choose_random_location(unsigned long input, + unsigned long input_size, + unsigned long output, + unsigned long output_size) +{ + unsigned long choice = output; + unsigned long random_addr; + +#ifdef CONFIG_HIBERNATION + if (!cmdline_find_option_bool("kaslr")) { + warn("KASLR disabled: 'kaslr' not on cmdline (hibernation selected)."); + goto out; + } +#else + if (cmdline_find_option_bool("nokaslr")) { + warn("KASLR disabled: 'nokaslr' on cmdline."); + goto out; + } +#endif + + boot_params->hdr.loadflags |= KASLR_FLAG; + + /* Record the various known unsafe memory ranges. */ + mem_avoid_init(input, input_size, output); + + /* Walk e820 and find a random address. */ + random_addr = find_random_phys_addr(output, output_size); + if (!random_addr) { + warn("KASLR disabled: could not find suitable E820 region!"); + goto out; + } + + /* Always enforce the minimum. */ + if (random_addr < choice) + goto out; + + choice = random_addr; + + add_identity_map(choice, output_size); + + /* This actually loads the identity pagetable on x86_64. */ + finalize_identity_maps(); +out: + return (unsigned char *)choice; +} diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 79dac1758e7c..f14db4e21654 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -1,8 +1,10 @@ /* * misc.c * - * This is a collection of several routines from gzip-1.0.3 - * adapted for Linux. 
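(A side note on choose_random_location() above: the virtual-address path boils down to simple slot arithmetic. The stand-alone sketch below mirrors the formula in find_random_virt_addr(); the alignment, limits, image size and the "random" pick are invented for illustration and are not the kernel's real configuration values.)

#include <stdio.h>

#define IMAGE_LIMIT	(1024UL << 20)	/* stand-in for KERNEL_IMAGE_SIZE: assume 1 GiB */
#define PHYS_ALIGN	(2UL << 20)	/* stand-in for CONFIG_PHYSICAL_ALIGN: assume 2 MiB */
#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long minimum    = ALIGN_UP(16UL << 20, PHYS_ALIGN);	/* assumed minimum address */
	unsigned long image_size = ALIGN_UP(30UL << 20, PHYS_ALIGN);	/* assumed image size */
	unsigned long slots, pick;

	/* Same slot-count formula as find_random_virt_addr(). */
	slots = (IMAGE_LIMIT - minimum - image_size) / PHYS_ALIGN + 1;

	pick = 123 % slots;	/* pretend get_random_long() returned 123 */
	printf("%lu slots, chose 0x%lx\n", slots, pick * PHYS_ALIGN + minimum);
	return 0;
}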
+ * This is a collection of several routines used to extract the kernel + * which includes KASLR relocation, decompression, ELF parsing, and + * relocation processing. Additionally included are the screen and serial + * output functions and related debugging support functions. * * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994 * puts by Nick Holloway 1993, better puts by Martin Mares 1995 @@ -10,111 +12,37 @@ */ #include "misc.h" +#include "error.h" #include "../string.h" - -/* WARNING!! - * This code is compiled with -fPIC and it is relocated dynamically - * at run time, but no relocation processing is performed. - * This means that it is not safe to place pointers in static structures. - */ +#include "../voffset.h" /* - * Getting to provable safe in place decompression is hard. - * Worst case behaviours need to be analyzed. - * Background information: - * - * The file layout is: - * magic[2] - * method[1] - * flags[1] - * timestamp[4] - * extraflags[1] - * os[1] - * compressed data blocks[N] - * crc[4] orig_len[4] - * - * resulting in 18 bytes of non compressed data overhead. - * - * Files divided into blocks - * 1 bit (last block flag) - * 2 bits (block type) - * - * 1 block occurs every 32K -1 bytes or when there 50% compression - * has been achieved. The smallest block type encoding is always used. - * - * stored: - * 32 bits length in bytes. - * - * fixed: - * magic fixed tree. - * symbols. - * - * dynamic: - * dynamic tree encoding. - * symbols. - * - * - * The buffer for decompression in place is the length of the - * uncompressed data, plus a small amount extra to keep the algorithm safe. - * The compressed data is placed at the end of the buffer. The output - * pointer is placed at the start of the buffer and the input pointer - * is placed where the compressed data starts. Problems will occur - * when the output pointer overruns the input pointer. - * - * The output pointer can only overrun the input pointer if the input - * pointer is moving faster than the output pointer. A condition only - * triggered by data whose compressed form is larger than the uncompressed - * form. - * - * The worst case at the block level is a growth of the compressed data - * of 5 bytes per 32767 bytes. - * - * The worst case internal to a compressed block is very hard to figure. - * The worst case can at least be boundined by having one bit that represents - * 32764 bytes and then all of the rest of the bytes representing the very - * very last byte. - * - * All of which is enough to compute an amount of extra data that is required - * to be safe. To avoid problems at the block level allocating 5 extra bytes - * per 32767 bytes of data is sufficient. To avoind problems internal to a - * block adding an extra 32767 bytes (the worst case uncompressed block size) - * is sufficient, to ensure that in the worst case the decompressed data for - * block will stop the byte before the compressed data for a block begins. - * To avoid problems with the compressed data's meta information an extra 18 - * bytes are needed. Leading to the formula: - * - * extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size. - * - * Adding 8 bytes per 32K is a bit excessive but much easier to calculate. - * Adding 32768 instead of 32767 just makes for round numbers. - * Adding the decompressor_size is necessary as it musht live after all - * of the data as well. Last I measured the decompressor is about 14K. - * 10K of actual data and 4K of bss. - * + * WARNING!! 
+ * This code is compiled with -fPIC and it is relocated dynamically at + * run time, but no relocation processing is performed. This means that + * it is not safe to place pointers in static structures. */ -/* - * gzip declarations - */ +/* Macros used by the included decompressor code below. */ #define STATIC static -#undef memcpy - /* - * Use a normal definition of memset() from string.c. There are already + * Use normal definitions of mem*() from string.c. There are already * included header files which expect a definition of memset() and by * the time we define memset macro, it is too late. */ +#undef memcpy #undef memset #define memzero(s, n) memset((s), 0, (n)) +#define memmove memmove - -static void error(char *m); +/* Functions used by the included decompressor code below. */ +void *memmove(void *dest, const void *src, size_t n); /* * This is set up by the setup-routine at boot-time */ -struct boot_params *real_mode; /* Pointer to real-mode data */ +struct boot_params *boot_params; memptr free_mem_ptr; memptr free_mem_end_ptr; @@ -146,12 +74,16 @@ static int lines, cols; #ifdef CONFIG_KERNEL_LZ4 #include "../../../../lib/decompress_unlz4.c" #endif +/* + * NOTE: When adding a new decompressor, please update the analysis in + * ../header.S. + */ static void scroll(void) { int i; - memcpy(vidmem, vidmem + cols * 2, (lines - 1) * cols * 2); + memmove(vidmem, vidmem + cols * 2, (lines - 1) * cols * 2); for (i = (lines - 1) * cols * 2; i < lines * cols * 2; i += 2) vidmem[i] = ' '; } @@ -184,12 +116,12 @@ void __putstr(const char *s) } } - if (real_mode->screen_info.orig_video_mode == 0 && + if (boot_params->screen_info.orig_video_mode == 0 && lines == 0 && cols == 0) return; - x = real_mode->screen_info.orig_x; - y = real_mode->screen_info.orig_y; + x = boot_params->screen_info.orig_x; + y = boot_params->screen_info.orig_y; while ((c = *s++) != '\0') { if (c == '\n') { @@ -210,8 +142,8 @@ void __putstr(const char *s) } } - real_mode->screen_info.orig_x = x; - real_mode->screen_info.orig_y = y; + boot_params->screen_info.orig_x = x; + boot_params->screen_info.orig_y = y; pos = (x + cols * y) * 2; /* Update cursor position */ outb(14, vidport); @@ -237,23 +169,13 @@ void __puthex(unsigned long value) } } -static void error(char *x) -{ - error_putstr("\n\n"); - error_putstr(x); - error_putstr("\n\n -- System halted"); - - while (1) - asm("hlt"); -} - #if CONFIG_X86_NEED_RELOCS static void handle_relocations(void *output, unsigned long output_len) { int *reloc; unsigned long delta, map, ptr; unsigned long min_addr = (unsigned long)output; - unsigned long max_addr = min_addr + output_len; + unsigned long max_addr = min_addr + (VO___bss_start - VO__text); /* * Calculate the delta between where vmlinux was linked to load @@ -295,7 +217,7 @@ static void handle_relocations(void *output, unsigned long output_len) * So we work backwards from the end of the decompressed image. 
*/ for (reloc = output + output_len - sizeof(*reloc); *reloc; reloc--) { - int extended = *reloc; + long extended = *reloc; extended += map; ptr = (unsigned long)extended; @@ -372,9 +294,7 @@ static void parse_elf(void *output) #else dest = (void *)(phdr->p_paddr); #endif - memcpy(dest, - output + phdr->p_offset, - phdr->p_filesz); + memmove(dest, output + phdr->p_offset, phdr->p_filesz); break; default: /* Ignore other PT_* */ break; } @@ -383,23 +303,41 @@ static void parse_elf(void *output) free(phdrs); } -asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, +/* + * The compressed kernel image (ZO), has been moved so that its position + * is against the end of the buffer used to hold the uncompressed kernel + * image (VO) and the execution environment (.bss, .brk), which makes sure + * there is room to do the in-place decompression. (See header.S for the + * calculations.) + * + * |-----compressed kernel image------| + * V V + * 0 extract_offset +INIT_SIZE + * |-----------|---------------|-------------------------|--------| + * | | | | + * VO__text startup_32 of ZO VO__end ZO__end + * ^ ^ + * |-------uncompressed kernel image---------| + * + */ +asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, unsigned char *input_data, unsigned long input_len, unsigned char *output, - unsigned long output_len, - unsigned long run_size) + unsigned long output_len) { + const unsigned long kernel_total_size = VO__end - VO__text; unsigned char *output_orig = output; - real_mode = rmode; + /* Retain x86 boot parameters pointer passed from startup_32/64. */ + boot_params = rmode; - /* Clear it for solely in-kernel use */ - real_mode->hdr.loadflags &= ~KASLR_FLAG; + /* Clear flags intended for solely in-kernel use. */ + boot_params->hdr.loadflags &= ~KASLR_FLAG; - sanitize_boot_params(real_mode); + sanitize_boot_params(boot_params); - if (real_mode->screen_info.orig_video_mode == 7) { + if (boot_params->screen_info.orig_video_mode == 7) { vidmem = (char *) 0xb0000; vidport = 0x3b4; } else { @@ -407,11 +345,11 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, vidport = 0x3d4; } - lines = real_mode->screen_info.orig_video_lines; - cols = real_mode->screen_info.orig_video_cols; + lines = boot_params->screen_info.orig_video_lines; + cols = boot_params->screen_info.orig_video_cols; console_init(); - debug_putstr("early console in decompress_kernel\n"); + debug_putstr("early console in extract_kernel\n"); free_mem_ptr = heap; /* Heap */ free_mem_end_ptr = heap + BOOT_HEAP_SIZE; @@ -421,16 +359,16 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, debug_putaddr(input_len); debug_putaddr(output); debug_putaddr(output_len); - debug_putaddr(run_size); + debug_putaddr(kernel_total_size); /* * The memory hole needed for the kernel is the larger of either * the entire decompressed kernel plus relocation table, or the * entire decompressed kernel plus .bss and .brk sections. */ - output = choose_kernel_location(real_mode, input_data, input_len, output, - output_len > run_size ? output_len - : run_size); + output = choose_random_location((unsigned long)input_data, input_len, + (unsigned long)output, + max(output_len, kernel_total_size)); /* Validate memory location choices. 
*/ if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1)) diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 3783dc3e10b3..b6fec1ff10e4 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -32,7 +32,7 @@ /* misc.c */ extern memptr free_mem_ptr; extern memptr free_mem_end_ptr; -extern struct boot_params *real_mode; /* Pointer to real-mode data */ +extern struct boot_params *boot_params; void __putstr(const char *s); void __puthex(unsigned long value); #define error_putstr(__x) __putstr(__x) @@ -66,26 +66,35 @@ int cmdline_find_option_bool(const char *option); #if CONFIG_RANDOMIZE_BASE -/* aslr.c */ -unsigned char *choose_kernel_location(struct boot_params *boot_params, - unsigned char *input, +/* kaslr.c */ +unsigned char *choose_random_location(unsigned long input_ptr, unsigned long input_size, - unsigned char *output, + unsigned long output_ptr, unsigned long output_size); /* cpuflags.c */ bool has_cpuflag(int flag); #else static inline -unsigned char *choose_kernel_location(struct boot_params *boot_params, - unsigned char *input, +unsigned char *choose_random_location(unsigned long input_ptr, unsigned long input_size, - unsigned char *output, + unsigned long output_ptr, unsigned long output_size) { - return output; + return (unsigned char *)output_ptr; } #endif +#ifdef CONFIG_X86_64 +void add_identity_map(unsigned long start, unsigned long size); +void finalize_identity_maps(void); +extern unsigned char _pgtable[]; +#else +static inline void add_identity_map(unsigned long start, unsigned long size) +{ } +static inline void finalize_identity_maps(void) +{ } +#endif + #ifdef CONFIG_EARLY_PRINTK /* early_serial_console.c */ extern int early_serial_base; diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c index d8222f213182..72bad2c8debe 100644 --- a/arch/x86/boot/compressed/mkpiggy.c +++ b/arch/x86/boot/compressed/mkpiggy.c @@ -18,11 +18,10 @@ * * H. Peter Anvin <hpa@linux.intel.com> * - * ----------------------------------------------------------------------- */ - -/* - * Compute the desired load offset from a compressed program; outputs - * a small assembly wrapper with the appropriate symbols defined. + * ----------------------------------------------------------------------- + * + * Outputs a small assembly wrapper with the appropriate symbols defined. + * */ #include <stdlib.h> @@ -35,14 +34,11 @@ int main(int argc, char *argv[]) { uint32_t olen; long ilen; - unsigned long offs; - unsigned long run_size; FILE *f = NULL; int retval = 1; - if (argc < 3) { - fprintf(stderr, "Usage: %s compressed_file run_size\n", - argv[0]); + if (argc < 2) { + fprintf(stderr, "Usage: %s compressed_file\n", argv[0]); goto bail; } @@ -67,29 +63,11 @@ int main(int argc, char *argv[]) ilen = ftell(f); olen = get_unaligned_le32(&olen); - /* - * Now we have the input (compressed) and output (uncompressed) - * sizes, compute the necessary decompression offset... - */ - - offs = (olen > ilen) ? 
olen - ilen : 0; - offs += olen >> 12; /* Add 8 bytes for each 32K block */ - offs += 64*1024 + 128; /* Add 64K + 128 bytes slack */ - offs = (offs+4095) & ~4095; /* Round to a 4K boundary */ - run_size = atoi(argv[2]); - printf(".section \".rodata..compressed\",\"a\",@progbits\n"); printf(".globl z_input_len\n"); printf("z_input_len = %lu\n", ilen); printf(".globl z_output_len\n"); printf("z_output_len = %lu\n", (unsigned long)olen); - printf(".globl z_extract_offset\n"); - printf("z_extract_offset = 0x%lx\n", offs); - /* z_extract_offset_negative allows simplification of head_32.S */ - printf(".globl z_extract_offset_negative\n"); - printf("z_extract_offset_negative = -0x%lx\n", offs); - printf(".globl z_run_size\n"); - printf("z_run_size = %lu\n", run_size); printf(".globl input_data, input_data_end\n"); printf("input_data:\n"); diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c new file mode 100644 index 000000000000..34b95df14e69 --- /dev/null +++ b/arch/x86/boot/compressed/pagetable.c @@ -0,0 +1,129 @@ +/* + * This code is used on x86_64 to create page table identity mappings on + * demand by building up a new set of page tables (or appending to the + * existing ones), and then switching over to them when ready. + */ + +/* + * Since we're dealing with identity mappings, physical and virtual + * addresses are the same, so override these defines which are ultimately + * used by the headers in misc.h. + */ +#define __pa(x) ((unsigned long)(x)) +#define __va(x) ((void *)((unsigned long)(x))) + +#include "misc.h" + +/* These actually do the work of building the kernel identity maps. */ +#include <asm/init.h> +#include <asm/pgtable.h> +#include "../../mm/ident_map.c" + +/* Used by pgtable.h asm code to force instruction serialization. */ +unsigned long __force_order; + +/* Used to track our page table allocation area. */ +struct alloc_pgt_data { + unsigned char *pgt_buf; + unsigned long pgt_buf_size; + unsigned long pgt_buf_offset; +}; + +/* + * Allocates space for a page table entry, using struct alloc_pgt_data + * above. Besides the local callers, this is used as the allocation + * callback in mapping_info below. + */ +static void *alloc_pgt_page(void *context) +{ + struct alloc_pgt_data *pages = (struct alloc_pgt_data *)context; + unsigned char *entry; + + /* Validate there is space available for a new page. */ + if (pages->pgt_buf_offset >= pages->pgt_buf_size) { + debug_putstr("out of pgt_buf in " __FILE__ "!?\n"); + debug_putaddr(pages->pgt_buf_offset); + debug_putaddr(pages->pgt_buf_size); + return NULL; + } + + entry = pages->pgt_buf + pages->pgt_buf_offset; + pages->pgt_buf_offset += PAGE_SIZE; + + return entry; +} + +/* Used to track our allocated page tables. */ +static struct alloc_pgt_data pgt_data; + +/* The top level page table entry pointer. */ +static unsigned long level4p; + +/* Locates and clears a region for a new top level page table. */ +static void prepare_level4(void) +{ + /* + * It should be impossible for this not to already be true, + * but since calling this a second time would rewind the other + * counters, let's just make sure this is reset too. + */ + pgt_data.pgt_buf_offset = 0; + + /* + * If we came here via startup_32(), cr3 will be _pgtable already + * and we must append to the existing area instead of entirely + * overwriting it. 
+ */ + level4p = read_cr3(); + if (level4p == (unsigned long)_pgtable) { + debug_putstr("booted via startup_32()\n"); + pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE; + pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE; + memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); + } else { + debug_putstr("booted via startup_64()\n"); + pgt_data.pgt_buf = _pgtable; + pgt_data.pgt_buf_size = BOOT_PGT_SIZE; + memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); + level4p = (unsigned long)alloc_pgt_page(&pgt_data); + } +} + +/* + * Adds the specified range to what will become the new identity mappings. + * Once all ranges have been added, the new mapping is activated by calling + * finalize_identity_maps() below. + */ +void add_identity_map(unsigned long start, unsigned long size) +{ + struct x86_mapping_info mapping_info = { + .alloc_pgt_page = alloc_pgt_page, + .context = &pgt_data, + .pmd_flag = __PAGE_KERNEL_LARGE_EXEC, + }; + unsigned long end = start + size; + + /* Make sure we have a top level page table ready to use. */ + if (!level4p) + prepare_level4(); + + /* Align boundary to 2M. */ + start = round_down(start, PMD_SIZE); + end = round_up(end, PMD_SIZE); + if (start >= end) + return; + + /* Build the mapping. */ + kernel_ident_mapping_init(&mapping_info, (pgd_t *)level4p, + start, end); +} + +/* + * This switches the page tables to the new level4 that has been built + * via calls to add_identity_map() above. If booted via startup_32(), + * this is effectively a no-op. + */ +void finalize_identity_maps(void) +{ + write_cr3(level4p); +} diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c index 00e788be1db9..cea140ce6b42 100644 --- a/arch/x86/boot/compressed/string.c +++ b/arch/x86/boot/compressed/string.c @@ -1,7 +1,16 @@ +/* + * This provides an optimized implementation of memcpy, and a simplified + * implementation of memset and memmove. These are used here because the + * standard kernel runtime versions are not yet available and we don't + * trust the gcc built-in implementations as they may do unexpected things + * (e.g. FPU ops) in the minimal decompression stub execution environment. + */ +#include "error.h" + #include "../string.c" #ifdef CONFIG_X86_32 -void *memcpy(void *dest, const void *src, size_t n) +static void *__memcpy(void *dest, const void *src, size_t n) { int d0, d1, d2; asm volatile( @@ -15,7 +24,7 @@ void *memcpy(void *dest, const void *src, size_t n) return dest; } #else -void *memcpy(void *dest, const void *src, size_t n) +static void *__memcpy(void *dest, const void *src, size_t n) { long d0, d1, d2; asm volatile( @@ -39,3 +48,27 @@ void *memset(void *s, int c, size_t n) ss[i] = c; return s; } + +void *memmove(void *dest, const void *src, size_t n) +{ + unsigned char *d = dest; + const unsigned char *s = src; + + if (d <= s || d - s >= n) + return __memcpy(dest, src, n); + + while (n-- > 0) + d[n] = s[n]; + + return dest; +} + +/* Detect and warn about potential overlaps, but handle them with memmove. */ +void *memcpy(void *dest, const void *src, size_t n) +{ + if (dest > src && dest - src < n) { + warn("Avoiding potentially unsafe overlapping memcpy()!"); + return memmove(dest, src, n); + } + return __memcpy(dest, src, n); +} diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S index 34d047c98284..e24e0a0c90c9 100644 --- a/arch/x86/boot/compressed/vmlinux.lds.S +++ b/arch/x86/boot/compressed/vmlinux.lds.S @@ -70,5 +70,6 @@ SECTIONS _epgtable = . ; } #endif + . 
= ALIGN(PAGE_SIZE); /* keep ZO size page aligned */ _end = .; } diff --git a/arch/x86/boot/early_serial_console.c b/arch/x86/boot/early_serial_console.c index 45a07684bbab..f0b8d6d93164 100644 --- a/arch/x86/boot/early_serial_console.c +++ b/arch/x86/boot/early_serial_console.c @@ -1,3 +1,7 @@ +/* + * Serial port routines for use during early boot reporting. This code is + * included from both the compressed kernel and the regular kernel. + */ #include "boot.h" #define DEFAULT_SERIAL_PORT 0x3f8 /* ttyS0 */ diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 6236b9ec4b76..3dd5be33aaa7 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -440,13 +440,116 @@ setup_data: .quad 0 # 64-bit physical pointer to pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr -#define ZO_INIT_SIZE (ZO__end - ZO_startup_32 + ZO_z_extract_offset) +# +# Getting to provably safe in-place decompression is hard. Worst case +# behaviours need to be analyzed. Here let's take the decompression of +# a gzip-compressed kernel as example, to illustrate it: +# +# The file layout of gzip compressed kernel is: +# +# magic[2] +# method[1] +# flags[1] +# timestamp[4] +# extraflags[1] +# os[1] +# compressed data blocks[N] +# crc[4] orig_len[4] +# +# ... resulting in +18 bytes overhead of uncompressed data. +# +# (For more information, please refer to RFC 1951 and RFC 1952.) +# +# Files divided into blocks +# 1 bit (last block flag) +# 2 bits (block type) +# +# 1 block occurs every 32K -1 bytes or when there 50% compression +# has been achieved. The smallest block type encoding is always used. +# +# stored: +# 32 bits length in bytes. +# +# fixed: +# magic fixed tree. +# symbols. +# +# dynamic: +# dynamic tree encoding. +# symbols. +# +# +# The buffer for decompression in place is the length of the uncompressed +# data, plus a small amount extra to keep the algorithm safe. The +# compressed data is placed at the end of the buffer. The output pointer +# is placed at the start of the buffer and the input pointer is placed +# where the compressed data starts. Problems will occur when the output +# pointer overruns the input pointer. +# +# The output pointer can only overrun the input pointer if the input +# pointer is moving faster than the output pointer. A condition only +# triggered by data whose compressed form is larger than the uncompressed +# form. +# +# The worst case at the block level is a growth of the compressed data +# of 5 bytes per 32767 bytes. +# +# The worst case internal to a compressed block is very hard to figure. +# The worst case can at least be bounded by having one bit that represents +# 32764 bytes and then all of the rest of the bytes representing the very +# very last byte. +# +# All of which is enough to compute an amount of extra data that is required +# to be safe. To avoid problems at the block level allocating 5 extra bytes +# per 32767 bytes of data is sufficient. To avoid problems internal to a +# block adding an extra 32767 bytes (the worst case uncompressed block size) +# is sufficient, to ensure that in the worst case the decompressed data for +# block will stop the byte before the compressed data for a block begins. +# To avoid problems with the compressed data's meta information an extra 18 +# bytes are needed. Leading to the formula: +# +# extra_bytes = (uncompressed_size >> 12) + 32768 + 18 +# +# Adding 8 bytes per 32K is a bit excessive but much easier to calculate. +# Adding 32768 instead of 32767 just makes for round numbers. 
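(The arithmetic above is easy to sanity-check numerically. The stand-alone snippet below mirrors the gzip-only formula with made-up image sizes; the paragraphs that follow then widen the constants so the same margin also covers the other supported decompressors.)

#include <stdio.h>

int main(void)
{
	unsigned long output_len = 26UL << 20;	/* assumed uncompressed size (26 MiB) */
	unsigned long input_len  =  7UL << 20;	/* assumed compressed size (7 MiB) */
	unsigned long extra, offset;

	/* extra_bytes = (uncompressed_size >> 12) + 32768 + 18 */
	extra = (output_len >> 12) + 32768 + 18;

	/*
	 * The compressed payload sits at the end of the buffer, so the
	 * extract offset only needs to absorb however far the output can
	 * grow past the input, plus the safety margin.
	 */
	offset = (output_len > input_len) ? output_len - input_len + extra : extra;

	printf("extra bytes: %lu, extract offset: %lu\n", extra, offset);
	return 0;
}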
+# +# Above analysis is for decompressing gzip compressed kernel only. Up to +# now 6 different decompressor are supported all together. And among them +# xz stores data in chunks and has maximum chunk of 64K. Hence safety +# margin should be updated to cover all decompressors so that we don't +# need to deal with each of them separately. Please check +# the description in lib/decompressor_xxx.c for specific information. +# +# extra_bytes = (uncompressed_size >> 12) + 65536 + 128 + +#define ZO_z_extra_bytes ((ZO_z_output_len >> 12) + 65536 + 128) +#if ZO_z_output_len > ZO_z_input_len +# define ZO_z_extract_offset (ZO_z_output_len + ZO_z_extra_bytes - \ + ZO_z_input_len) +#else +# define ZO_z_extract_offset ZO_z_extra_bytes +#endif + +/* + * The extract_offset has to be bigger than ZO head section. Otherwise when + * the head code is running to move ZO to the end of the buffer, it will + * overwrite the head code itself. + */ +#if (ZO__ehead - ZO_startup_32) > ZO_z_extract_offset +# define ZO_z_min_extract_offset ((ZO__ehead - ZO_startup_32 + 4095) & ~4095) +#else +# define ZO_z_min_extract_offset ((ZO_z_extract_offset + 4095) & ~4095) +#endif + +#define ZO_INIT_SIZE (ZO__end - ZO_startup_32 + ZO_z_min_extract_offset) + #define VO_INIT_SIZE (VO__end - VO__text) #if ZO_INIT_SIZE > VO_INIT_SIZE -#define INIT_SIZE ZO_INIT_SIZE +# define INIT_SIZE ZO_INIT_SIZE #else -#define INIT_SIZE VO_INIT_SIZE +# define INIT_SIZE VO_INIT_SIZE #endif + init_size: .long INIT_SIZE # kernel initialization size handover_offset: .long 0 # Filled in by build.c diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 4f404a64681b..0c8d7963483c 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -173,6 +173,7 @@ CONFIG_TIGON3=y CONFIG_NET_TULIP=y CONFIG_E100=y CONFIG_E1000=y +CONFIG_E1000E=y CONFIG_SKY2=y CONFIG_FORCEDETH=y CONFIG_8139TOO=y diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 064c7e2bd7c8..5b7fa1471007 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -1477,7 +1477,7 @@ static int __init aesni_init(void) } aesni_ctr_enc_tfm = aesni_ctr_enc; #ifdef CONFIG_AS_AVX - if (cpu_has_avx) { + if (boot_cpu_has(X86_FEATURE_AVX)) { /* optimize performance of ctr mode encryption transform */ aesni_ctr_enc_tfm = aesni_ctr_enc_avx_tfm; pr_info("AES CTR mode by8 optimization enabled\n"); diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c index d84456924563..60907c139c4e 100644 --- a/arch/x86/crypto/camellia_aesni_avx2_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c @@ -562,7 +562,10 @@ static int __init camellia_aesni_init(void) { const char *feature_name; - if (!cpu_has_avx2 || !cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) { + if (!boot_cpu_has(X86_FEATURE_AVX) || + !boot_cpu_has(X86_FEATURE_AVX2) || + !boot_cpu_has(X86_FEATURE_AES) || + !boot_cpu_has(X86_FEATURE_OSXSAVE)) { pr_info("AVX2 or AES-NI instructions are not detected.\n"); return -ENODEV; } diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c index 93d8f295784e..d96429da88eb 100644 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c @@ -554,7 +554,9 @@ static int __init camellia_aesni_init(void) { const char *feature_name; - if (!cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) { + if (!boot_cpu_has(X86_FEATURE_AVX) || + !boot_cpu_has(X86_FEATURE_AES) || + 
!boot_cpu_has(X86_FEATURE_OSXSAVE)) { pr_info("AVX or AES-NI instructions are not detected.\n"); return -ENODEV; } diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c index 8baaff5af0b5..2d5c2e0bd939 100644 --- a/arch/x86/crypto/chacha20_glue.c +++ b/arch/x86/crypto/chacha20_glue.c @@ -129,7 +129,8 @@ static int __init chacha20_simd_mod_init(void) return -ENODEV; #ifdef CONFIG_AS_AVX2 - chacha20_use_avx2 = cpu_has_avx && cpu_has_avx2 && + chacha20_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); #endif return crypto_register_alg(&alg); diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c index 4264a3d59589..e32142bc071d 100644 --- a/arch/x86/crypto/poly1305_glue.c +++ b/arch/x86/crypto/poly1305_glue.c @@ -179,11 +179,12 @@ static struct shash_alg alg = { static int __init poly1305_simd_mod_init(void) { - if (!cpu_has_xmm2) + if (!boot_cpu_has(X86_FEATURE_XMM2)) return -ENODEV; #ifdef CONFIG_AS_AVX2 - poly1305_use_avx2 = cpu_has_avx && cpu_has_avx2 && + poly1305_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); alg.descsize = sizeof(struct poly1305_simd_desc_ctx); if (poly1305_use_avx2) diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c index 6d198342e2de..870f6d812a2d 100644 --- a/arch/x86/crypto/serpent_avx2_glue.c +++ b/arch/x86/crypto/serpent_avx2_glue.c @@ -538,7 +538,7 @@ static int __init init(void) { const char *feature_name; - if (!cpu_has_avx2 || !cpu_has_osxsave) { + if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_OSXSAVE)) { pr_info("AVX2 instructions are not detected.\n"); return -ENODEV; } diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index 8943407e8917..644f97ab8cac 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -600,7 +600,7 @@ static struct crypto_alg serpent_algs[10] = { { static int __init serpent_sse2_init(void) { - if (!cpu_has_xmm2) { + if (!boot_cpu_has(X86_FEATURE_XMM2)) { printk(KERN_INFO "SSE2 instructions are not detected.\n"); return -ENODEV; } diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c index 081255cea1ee..9c5af331a956 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb.c +++ b/arch/x86/crypto/sha-mb/sha1_mb.c @@ -102,14 +102,14 @@ static asmlinkage struct job_sha1* (*sha1_job_mgr_submit)(struct sha1_mb_mgr *st static asmlinkage struct job_sha1* (*sha1_job_mgr_flush)(struct sha1_mb_mgr *state); static asmlinkage struct job_sha1* (*sha1_job_mgr_get_comp_job)(struct sha1_mb_mgr *state); -inline void sha1_init_digest(uint32_t *digest) +static inline void sha1_init_digest(uint32_t *digest) { static const uint32_t initial_digest[SHA1_DIGEST_LENGTH] = {SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 }; memcpy(digest, initial_digest, sizeof(initial_digest)); } -inline uint32_t sha1_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], +static inline uint32_t sha1_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint32_t total_len) { uint32_t i = total_len & (SHA1_BLOCK_SIZE - 1); diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index dd14616b7739..1024e378a358 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -166,7 +166,7 @@ static struct shash_alg sha1_avx_alg = { static bool avx_usable(void) { if 
(!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { - if (cpu_has_avx) + if (boot_cpu_has(X86_FEATURE_AVX)) pr_info("AVX detected but unusable.\n"); return false; } diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index 5f4d6086dc59..3ae0f43ebd37 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -201,7 +201,7 @@ static struct shash_alg sha256_avx_algs[] = { { static bool avx_usable(void) { if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { - if (cpu_has_avx) + if (boot_cpu_has(X86_FEATURE_AVX)) pr_info("AVX detected but unusable.\n"); return false; } diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index 34e5083d6f36..0b17c83d027d 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -151,7 +151,7 @@ asmlinkage void sha512_transform_avx(u64 *digest, const char *data, static bool avx_usable(void) { if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { - if (cpu_has_avx) + if (boot_cpu_has(X86_FEATURE_AVX)) pr_info("AVX detected but unusable.\n"); return false; } diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index e79d93d44ecd..ec138e538c44 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -191,7 +191,7 @@ long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch, long syscall_trace_enter(struct pt_regs *regs) { - u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; + u32 arch = in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch); if (phase1_result == 0) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 10868aa734dc..983e5d3a0d27 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -207,10 +207,7 @@ ENTRY(ret_from_fork) pushl %eax call schedule_tail - GET_THREAD_INFO(%ebp) popl %eax - pushl $0x0202 # Reset kernel eflags - popfl /* When we fork, we trace the syscall return in the child, too. */ movl %esp, %eax @@ -221,10 +218,7 @@ END(ret_from_fork) ENTRY(ret_from_kernel_thread) pushl %eax call schedule_tail - GET_THREAD_INFO(%ebp) popl %eax - pushl $0x0202 # Reset kernel eflags - popfl movl PT_EBP(%esp), %eax call *PT_EBX(%esp) movl $0, PT_EAX(%esp) @@ -251,7 +245,6 @@ ENDPROC(ret_from_kernel_thread) ret_from_exception: preempt_stop(CLBR_ANY) ret_from_intr: - GET_THREAD_INFO(%ebp) #ifdef CONFIG_VM86 movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS movb PT_CS(%esp), %al diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 858b555e274b..9ee0da1807ed 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -372,9 +372,6 @@ END(ptregs_\func) ENTRY(ret_from_fork) LOCK ; btr $TIF_FORK, TI_flags(%r8) - pushq $0x0002 - popfq /* reset kernel eflags */ - call schedule_tail /* rdi: 'prev' task parameter */ testb $3, CS(%rsp) /* from kernel_thread? */ @@ -781,19 +778,25 @@ ENTRY(native_load_gs_index) pushfq DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) SWAPGS -gs_change: +.Lgs_change: movl %edi, %gs -2: mfence /* workaround */ +2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE SWAPGS popfq ret END(native_load_gs_index) - _ASM_EXTABLE(gs_change, bad_gs) + _ASM_EXTABLE(.Lgs_change, bad_gs) .section .fixup, "ax" /* running with kernelgs */ bad_gs: SWAPGS /* switch back to user gs */ +.macro ZAP_GS + /* This can't be a string because the preprocessor needs to see it. 
*/ + movl $__USER_DS, %eax + movl %eax, %gs +.endm + ALTERNATIVE "", "ZAP_GS", X86_BUG_NULL_SEG xorl %eax, %eax movl %eax, %gs jmp 2b @@ -1019,13 +1022,13 @@ ENTRY(error_entry) movl %ecx, %eax /* zero extend */ cmpq %rax, RIP+8(%rsp) je .Lbstep_iret - cmpq $gs_change, RIP+8(%rsp) + cmpq $.Lgs_change, RIP+8(%rsp) jne .Lerror_entry_done /* - * hack: gs_change can fail with user gsbase. If this happens, fix up + * hack: .Lgs_change can fail with user gsbase. If this happens, fix up * gsbase and proceed. We'll fix up the exception and land in - * gs_change's error handler with kernel gsbase. + * .Lgs_change's error handler with kernel gsbase. */ jmp .Lerror_entry_from_usermode_swapgs diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 847f2f0c31e5..e1721dafbcb1 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -72,24 +72,23 @@ ENTRY(entry_SYSENTER_compat) pushfq /* pt_regs->flags (except IF = 0) */ orl $X86_EFLAGS_IF, (%rsp) /* Fix saved flags */ pushq $__USER32_CS /* pt_regs->cs */ - xorq %r8,%r8 - pushq %r8 /* pt_regs->ip = 0 (placeholder) */ + pushq $0 /* pt_regs->ip = 0 (placeholder) */ pushq %rax /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ pushq %rdx /* pt_regs->dx */ pushq %rcx /* pt_regs->cx */ pushq $-ENOSYS /* pt_regs->ax */ - pushq %r8 /* pt_regs->r8 = 0 */ - pushq %r8 /* pt_regs->r9 = 0 */ - pushq %r8 /* pt_regs->r10 = 0 */ - pushq %r8 /* pt_regs->r11 = 0 */ + pushq $0 /* pt_regs->r8 = 0 */ + pushq $0 /* pt_regs->r9 = 0 */ + pushq $0 /* pt_regs->r10 = 0 */ + pushq $0 /* pt_regs->r11 = 0 */ pushq %rbx /* pt_regs->rbx */ pushq %rbp /* pt_regs->rbp (will be overwritten) */ - pushq %r8 /* pt_regs->r12 = 0 */ - pushq %r8 /* pt_regs->r13 = 0 */ - pushq %r8 /* pt_regs->r14 = 0 */ - pushq %r8 /* pt_regs->r15 = 0 */ + pushq $0 /* pt_regs->r12 = 0 */ + pushq $0 /* pt_regs->r13 = 0 */ + pushq $0 /* pt_regs->r14 = 0 */ + pushq $0 /* pt_regs->r15 = 0 */ cld /* @@ -205,17 +204,16 @@ ENTRY(entry_SYSCALL_compat) pushq %rdx /* pt_regs->dx */ pushq %rbp /* pt_regs->cx (stashed in bp) */ pushq $-ENOSYS /* pt_regs->ax */ - xorq %r8,%r8 - pushq %r8 /* pt_regs->r8 = 0 */ - pushq %r8 /* pt_regs->r9 = 0 */ - pushq %r8 /* pt_regs->r10 = 0 */ - pushq %r8 /* pt_regs->r11 = 0 */ + pushq $0 /* pt_regs->r8 = 0 */ + pushq $0 /* pt_regs->r9 = 0 */ + pushq $0 /* pt_regs->r10 = 0 */ + pushq $0 /* pt_regs->r11 = 0 */ pushq %rbx /* pt_regs->rbx */ pushq %rbp /* pt_regs->rbp (will be overwritten) */ - pushq %r8 /* pt_regs->r12 = 0 */ - pushq %r8 /* pt_regs->r13 = 0 */ - pushq %r8 /* pt_regs->r14 = 0 */ - pushq %r8 /* pt_regs->r15 = 0 */ + pushq $0 /* pt_regs->r12 = 0 */ + pushq $0 /* pt_regs->r13 = 0 */ + pushq $0 /* pt_regs->r14 = 0 */ + pushq $0 /* pt_regs->r15 = 0 */ /* * User mode is traced as though IRQs are on, and SYSENTER @@ -316,11 +314,10 @@ ENTRY(entry_INT80_compat) pushq %rdx /* pt_regs->dx */ pushq %rcx /* pt_regs->cx */ pushq $-ENOSYS /* pt_regs->ax */ - xorq %r8,%r8 - pushq %r8 /* pt_regs->r8 = 0 */ - pushq %r8 /* pt_regs->r9 = 0 */ - pushq %r8 /* pt_regs->r10 = 0 */ - pushq %r8 /* pt_regs->r11 = 0 */ + pushq $0 /* pt_regs->r8 = 0 */ + pushq $0 /* pt_regs->r9 = 0 */ + pushq $0 /* pt_regs->r10 = 0 */ + pushq $0 /* pt_regs->r11 = 0 */ pushq %rbx /* pt_regs->rbx */ pushq %rbp /* pt_regs->rbp */ pushq %r12 /* pt_regs->r12 */ diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index cac6d17ce5db..555263e385c9 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl 
+++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -374,3 +374,5 @@ 543 x32 io_setup compat_sys_io_setup 544 x32 io_submit compat_sys_io_submit 545 x32 execveat compat_sys_execveat/ptregs +534 x32 preadv2 compat_sys_preadv2 +535 x32 pwritev2 compat_sys_pwritev2 diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 03c3eb77bfce..2f02d23a05ef 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -13,7 +13,6 @@ #include <uapi/linux/time.h> #include <asm/vgtod.h> -#include <asm/hpet.h> #include <asm/vvar.h> #include <asm/unistd.h> #include <asm/msr.h> @@ -28,16 +27,6 @@ extern int __vdso_clock_gettime(clockid_t clock, struct timespec *ts); extern int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz); extern time_t __vdso_time(time_t *t); -#ifdef CONFIG_HPET_TIMER -extern u8 hpet_page - __attribute__((visibility("hidden"))); - -static notrace cycle_t vread_hpet(void) -{ - return *(const volatile u32 *)(&hpet_page + HPET_COUNTER); -} -#endif - #ifdef CONFIG_PARAVIRT_CLOCK extern u8 pvclock_page __attribute__((visibility("hidden"))); @@ -195,10 +184,6 @@ notrace static inline u64 vgetsns(int *mode) if (gtod->vclock_mode == VCLOCK_TSC) cycles = vread_tsc(); -#ifdef CONFIG_HPET_TIMER - else if (gtod->vclock_mode == VCLOCK_HPET) - cycles = vread_hpet(); -#endif #ifdef CONFIG_PARAVIRT_CLOCK else if (gtod->vclock_mode == VCLOCK_PVCLOCK) cycles = vread_pvclock(mode); diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S index 4158acc17df0..a708aa90b507 100644 --- a/arch/x86/entry/vdso/vdso-layout.lds.S +++ b/arch/x86/entry/vdso/vdso-layout.lds.S @@ -25,7 +25,7 @@ SECTIONS * segment. */ - vvar_start = . - 3 * PAGE_SIZE; + vvar_start = . - 2 * PAGE_SIZE; vvar_page = vvar_start; /* Place all vvars at the offsets in asm/vvar.h. */ @@ -35,8 +35,7 @@ SECTIONS #undef __VVAR_KERNEL_LDS #undef EMIT_VVAR - hpet_page = vvar_start + PAGE_SIZE; - pvclock_page = vvar_start + 2 * PAGE_SIZE; + pvclock_page = vvar_start + PAGE_SIZE; . = SIZEOF_HEADERS; diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 10f704584922..b3cf81333a54 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -18,7 +18,6 @@ #include <asm/vdso.h> #include <asm/vvar.h> #include <asm/page.h> -#include <asm/hpet.h> #include <asm/desc.h> #include <asm/cpufeature.h> @@ -129,16 +128,6 @@ static int vvar_fault(const struct vm_special_mapping *sm, if (sym_offset == image->sym_vvar_page) { ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, __pa_symbol(&__vvar_page) >> PAGE_SHIFT); - } else if (sym_offset == image->sym_hpet_page) { -#ifdef CONFIG_HPET_TIMER - if (hpet_address && vclock_was_used(VCLOCK_HPET)) { - ret = vm_insert_pfn_prot( - vma, - (unsigned long)vmf->virtual_address, - hpet_address >> PAGE_SHIFT, - pgprot_noncached(PAGE_READONLY)); - } -#endif } else if (sym_offset == image->sym_pvclock_page) { struct pvclock_vsyscall_time_info *pvti = pvclock_pvti_cpu0_va(); diff --git a/arch/x86/events/Kconfig b/arch/x86/events/Kconfig new file mode 100644 index 000000000000..98397db5ceae --- /dev/null +++ b/arch/x86/events/Kconfig @@ -0,0 +1,36 @@ +menu "Performance monitoring" + +config PERF_EVENTS_INTEL_UNCORE + tristate "Intel uncore performance events" + depends on PERF_EVENTS && CPU_SUP_INTEL && PCI + default y + ---help--- + Include support for Intel uncore performance events. These are + available on NehalemEX and more modern processors. 
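With the hpet_page mapping and vread_hpet() removed in the vclock_gettime.c and vdso hunks above, the vDSO time-read path only has to choose between the TSC and the optional paravirt clock. The following is a minimal sketch of that remaining dispatch, assuming the VCLOCK_* constants, the gtod fields and the vread_tsc()/vread_pvclock() helpers visible in the surrounding hunks; it is illustrative and not the verbatim kernel function.

static inline u64 vgetsns_sketch(int *mode, const struct vsyscall_gtod_data *gtod)
{
	u64 cycles;

	if (gtod->vclock_mode == VCLOCK_TSC)
		cycles = vread_tsc();			/* plain RDTSC-based fast path */
#ifdef CONFIG_PARAVIRT_CLOCK
	else if (gtod->vclock_mode == VCLOCK_PVCLOCK)
		cycles = vread_pvclock(mode);		/* guest paravirt clock page */
#endif
	else
		return 0;				/* caller falls back to the syscall */

	/* mask and scale the cycle delta; the shift is applied by the caller */
	return ((cycles - gtod->cycle_last) & gtod->mask) * gtod->mult;
}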
+ +config PERF_EVENTS_INTEL_RAPL + tristate "Intel rapl performance events" + depends on PERF_EVENTS && CPU_SUP_INTEL && PCI + default y + ---help--- + Include support for Intel rapl performance events for power + monitoring on modern processors. + +config PERF_EVENTS_INTEL_CSTATE + tristate "Intel cstate performance events" + depends on PERF_EVENTS && CPU_SUP_INTEL && PCI + default y + ---help--- + Include support for Intel cstate performance events for power + monitoring on modern processors. + +config PERF_EVENTS_AMD_POWER + depends on PERF_EVENTS && CPU_SUP_AMD + tristate "AMD Processor Power Reporting Mechanism" + ---help--- + Provide power reporting mechanism support for AMD processors. + Currently, it leverages X86_FEATURE_ACC_POWER + (CPUID Fn8000_0007_EDX[12]) interface to calculate the + average power consumption on Family 15h processors. + +endmenu diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index f59618a39990..1d392c39fe56 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -6,9 +6,6 @@ obj-$(CONFIG_X86_LOCAL_APIC) += amd/ibs.o msr.o ifdef CONFIG_AMD_IOMMU obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o endif -obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o intel/cqm.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o intel/ds.o intel/knc.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel/lbr.o intel/p4.o intel/p6.o intel/pt.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel/rapl.o msr.o -obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore.o intel/uncore_nhmex.o -obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore_snb.o intel/uncore_snbep.o + +obj-$(CONFIG_CPU_SUP_INTEL) += msr.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel/ diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index 3db9569e658c..98ac57381bf9 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -263,6 +263,7 @@ static const struct attribute_group *amd_uncore_attr_groups[] = { }; static struct pmu amd_nb_pmu = { + .task_ctx_nr = perf_invalid_context, .attr_groups = amd_uncore_attr_groups, .name = "amd_nb", .event_init = amd_uncore_event_init, @@ -274,6 +275,7 @@ static struct pmu amd_nb_pmu = { }; static struct pmu amd_l2_pmu = { + .task_ctx_nr = perf_invalid_context, .attr_groups = amd_uncore_attr_groups, .name = "amd_l2", .event_init = amd_uncore_event_init, diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 041e442a3e28..73a75aa5a66d 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -360,6 +360,9 @@ int x86_add_exclusive(unsigned int what) { int i; + if (x86_pmu.lbr_pt_coexist) + return 0; + if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) { mutex_lock(&pmc_reserve_mutex); for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) { @@ -380,6 +383,9 @@ fail_unlock: void x86_del_exclusive(unsigned int what) { + if (x86_pmu.lbr_pt_coexist) + return; + atomic_dec(&x86_pmu.lbr_exclusive[what]); atomic_dec(&active_events); } @@ -1518,7 +1524,7 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) static void __init pmu_check_apic(void) { - if (cpu_has_apic) + if (boot_cpu_has(X86_FEATURE_APIC)) return; x86_pmu.apic = 0; @@ -2177,7 +2183,7 @@ void arch_perf_update_userpage(struct perf_event *event, * cap_user_time_zero doesn't make sense when we're using a different * time base for the records. 
*/ - if (event->clock == &local_clock) { + if (!event->attr.use_clockid) { userpg->cap_user_time_zero = 1; userpg->time_zero = data->cyc2ns_offset; } @@ -2277,7 +2283,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) fp = compat_ptr(ss_base + regs->bp); pagefault_disable(); - while (entry->nr < PERF_MAX_STACK_DEPTH) { + while (entry->nr < sysctl_perf_event_max_stack) { unsigned long bytes; frame.next_frame = 0; frame.return_address = 0; @@ -2337,7 +2343,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) return; pagefault_disable(); - while (entry->nr < PERF_MAX_STACK_DEPTH) { + while (entry->nr < sysctl_perf_event_max_stack) { unsigned long bytes; frame.next_frame = NULL; frame.return_address = 0; diff --git a/arch/x86/events/intel/Makefile b/arch/x86/events/intel/Makefile new file mode 100644 index 000000000000..3660b2cf245a --- /dev/null +++ b/arch/x86/events/intel/Makefile @@ -0,0 +1,9 @@ +obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o cqm.o +obj-$(CONFIG_CPU_SUP_INTEL) += ds.o knc.o +obj-$(CONFIG_CPU_SUP_INTEL) += lbr.o p4.o p6.o pt.o +obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL) += intel-rapl.o +intel-rapl-objs := rapl.o +obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel-uncore.o +intel-uncore-objs := uncore.o uncore_nhmex.o uncore_snb.o uncore_snbep.o +obj-$(CONFIG_PERF_EVENTS_INTEL_CSTATE) += intel-cstate.o +intel-cstate-objs := cstate.o diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index b99dc9258c0f..0a6e393a2e62 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -171,18 +171,6 @@ static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head) memset(page_address(phys->page) + index, 0, phys->size - index); } -static bool bts_buffer_is_full(struct bts_buffer *buf, struct bts_ctx *bts) -{ - if (buf->snapshot) - return false; - - if (local_read(&buf->data_size) >= bts->handle.size || - bts->handle.size - local_read(&buf->data_size) < BTS_RECORD_SIZE) - return true; - - return false; -} - static void bts_update(struct bts_ctx *bts) { int cpu = raw_smp_processor_id(); @@ -213,18 +201,15 @@ static void bts_update(struct bts_ctx *bts) } } +static int +bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle); + static void __bts_event_start(struct perf_event *event) { struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); struct bts_buffer *buf = perf_get_aux(&bts->handle); u64 config = 0; - if (!buf || bts_buffer_is_full(buf, bts)) - return; - - event->hw.itrace_started = 1; - event->hw.state = 0; - if (!buf->snapshot) config |= ARCH_PERFMON_EVENTSEL_INT; if (!event->attr.exclude_kernel) @@ -241,16 +226,41 @@ static void __bts_event_start(struct perf_event *event) wmb(); intel_pmu_enable_bts(config); + } static void bts_event_start(struct perf_event *event, int flags) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + struct bts_buffer *buf; + + buf = perf_aux_output_begin(&bts->handle, event); + if (!buf) + goto fail_stop; + + if (bts_buffer_reset(buf, &bts->handle)) + goto fail_end_stop; + + bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base; + bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum; + bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold; + + event->hw.itrace_started = 1; + event->hw.state = 0; __bts_event_start(event); /* PMI handler: this counter is running and likely generating PMIs */ ACCESS_ONCE(bts->started) = 1; + + return; + +fail_end_stop: 
+ perf_aux_output_end(&bts->handle, 0, false); + +fail_stop: + event->hw.state = PERF_HES_STOPPED; } static void __bts_event_stop(struct perf_event *event) @@ -269,15 +279,32 @@ static void __bts_event_stop(struct perf_event *event) static void bts_event_stop(struct perf_event *event, int flags) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + struct bts_buffer *buf = perf_get_aux(&bts->handle); /* PMI handler: don't restart this counter */ ACCESS_ONCE(bts->started) = 0; __bts_event_stop(event); - if (flags & PERF_EF_UPDATE) + if (flags & PERF_EF_UPDATE) { bts_update(bts); + + if (buf) { + if (buf->snapshot) + bts->handle.head = + local_xchg(&buf->data_size, + buf->nr_pages << PAGE_SHIFT); + perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0), + !!local_xchg(&buf->lost, 0)); + } + + cpuc->ds->bts_index = bts->ds_back.bts_buffer_base; + cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base; + cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum; + cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold; + } } void intel_bts_enable_local(void) @@ -417,34 +444,14 @@ int intel_bts_interrupt(void) static void bts_event_del(struct perf_event *event, int mode) { - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); - struct bts_buffer *buf = perf_get_aux(&bts->handle); - bts_event_stop(event, PERF_EF_UPDATE); - - if (buf) { - if (buf->snapshot) - bts->handle.head = - local_xchg(&buf->data_size, - buf->nr_pages << PAGE_SHIFT); - perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0), - !!local_xchg(&buf->lost, 0)); - } - - cpuc->ds->bts_index = bts->ds_back.bts_buffer_base; - cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base; - cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum; - cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold; } static int bts_event_add(struct perf_event *event, int mode) { - struct bts_buffer *buf; struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - int ret = -EBUSY; event->hw.state = PERF_HES_STOPPED; @@ -454,26 +461,10 @@ static int bts_event_add(struct perf_event *event, int mode) if (bts->handle.event) return -EBUSY; - buf = perf_aux_output_begin(&bts->handle, event); - if (!buf) - return -EINVAL; - - ret = bts_buffer_reset(buf, &bts->handle); - if (ret) { - perf_aux_output_end(&bts->handle, 0, false); - return ret; - } - - bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base; - bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum; - bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold; - if (mode & PERF_EF_START) { bts_event_start(event, 0); - if (hwc->state & PERF_HES_STOPPED) { - bts_event_del(event, 0); - return -EBUSY; - } + if (hwc->state & PERF_HES_STOPPED) + return -EINVAL; } return 0; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index a6fd4dbcf820..7c666958a625 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -1465,6 +1465,140 @@ static __initconst const u64 slm_hw_cache_event_ids }, }; +static struct extra_reg intel_glm_extra_regs[] __read_mostly = { + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x760005ffbfull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x360005ffbfull, 
RSP_1), + EVENT_EXTRA_END +}; + +#define GLM_DEMAND_DATA_RD BIT_ULL(0) +#define GLM_DEMAND_RFO BIT_ULL(1) +#define GLM_ANY_RESPONSE BIT_ULL(16) +#define GLM_SNP_NONE_OR_MISS BIT_ULL(33) +#define GLM_DEMAND_READ GLM_DEMAND_DATA_RD +#define GLM_DEMAND_WRITE GLM_DEMAND_RFO +#define GLM_DEMAND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO) +#define GLM_LLC_ACCESS GLM_ANY_RESPONSE +#define GLM_SNP_ANY (GLM_SNP_NONE_OR_MISS|SNB_NO_FWD|SNB_HITM) +#define GLM_LLC_MISS (GLM_SNP_ANY|SNB_NON_DRAM) + +static __initconst const u64 glm_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { + [C(L1D)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */ + [C(RESULT_MISS)] = 0x0, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */ + [C(RESULT_MISS)] = 0x0, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x0, + [C(RESULT_MISS)] = 0x0, + }, + }, + [C(L1I)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0380, /* ICACHE.ACCESSES */ + [C(RESULT_MISS)] = 0x0280, /* ICACHE.MISSES */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x0, + [C(RESULT_MISS)] = 0x0, + }, + }, + [C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x1b7, /* OFFCORE_RESPONSE */ + [C(RESULT_MISS)] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x1b7, /* OFFCORE_RESPONSE */ + [C(RESULT_MISS)] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x1b7, /* OFFCORE_RESPONSE */ + [C(RESULT_MISS)] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + }, + [C(DTLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */ + [C(RESULT_MISS)] = 0x0, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */ + [C(RESULT_MISS)] = 0x0, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x0, + [C(RESULT_MISS)] = 0x0, + }, + }, + [C(ITLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x00c0, /* INST_RETIRED.ANY_P */ + [C(RESULT_MISS)] = 0x0481, /* ITLB.MISS */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + }, + [C(BPU)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [C(RESULT_MISS)] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + }, +}; + +static __initconst const u64 glm_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { + [C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = GLM_DEMAND_READ| + GLM_LLC_ACCESS, + [C(RESULT_MISS)] = GLM_DEMAND_READ| + GLM_LLC_MISS, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = GLM_DEMAND_WRITE| + GLM_LLC_ACCESS, + [C(RESULT_MISS)] = GLM_DEMAND_WRITE| + GLM_LLC_MISS, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = GLM_DEMAND_PREFETCH| + GLM_LLC_ACCESS, + [C(RESULT_MISS)] = GLM_DEMAND_PREFETCH| + GLM_LLC_MISS, + }, + }, +}; + #define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */ #define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */ #define KNL_MCDRAM_LOCAL BIT_ULL(21) @@ -3447,7 +3581,7 @@ __init int intel_pmu_init(void) memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); - 
intel_pmu_lbr_init_atom(); + intel_pmu_lbr_init_slm(); x86_pmu.event_constraints = intel_slm_event_constraints; x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints; @@ -3456,6 +3590,30 @@ __init int intel_pmu_init(void) pr_cont("Silvermont events, "); break; + case 92: /* 14nm Atom "Goldmont" */ + case 95: /* 14nm Atom "Goldmont Denverton" */ + memcpy(hw_cache_event_ids, glm_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, glm_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + + intel_pmu_lbr_init_skl(); + + x86_pmu.event_constraints = intel_slm_event_constraints; + x86_pmu.pebs_constraints = intel_glm_pebs_event_constraints; + x86_pmu.extra_regs = intel_glm_extra_regs; + /* + * It's recommended to use CPU_CLK_UNHALTED.CORE_P + NPEBS + * for precise cycles. + * :pp is identical to :ppp + */ + x86_pmu.pebs_aliases = NULL; + x86_pmu.pebs_prec_dist = true; + x86_pmu.lbr_pt_coexist = true; + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + pr_cont("Goldmont events, "); + break; + case 37: /* 32nm Westmere */ case 44: /* 32nm Westmere-EP */ case 47: /* 32nm Westmere-EX */ @@ -3708,7 +3866,7 @@ __init int intel_pmu_init(void) c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; } c->idxmsk64 &= - ~(~0UL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed)); + ~(~0ULL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed)); c->weight = hweight64(c->idxmsk64); } } diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 7946c4231169..9ba4e4136a15 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -91,6 +91,8 @@ #include <asm/cpu_device_id.h> #include "../perf_event.h" +MODULE_LICENSE("GPL"); + #define DEFINE_CSTATE_FORMAT_ATTR(_var, _name, _format) \ static ssize_t __cstate_##_var##_show(struct kobject *kobj, \ struct kobj_attribute *attr, \ @@ -106,22 +108,27 @@ static ssize_t cstate_get_attr_cpumask(struct device *dev, struct device_attribute *attr, char *buf); +/* Model -> events mapping */ +struct cstate_model { + unsigned long core_events; + unsigned long pkg_events; + unsigned long quirks; +}; + +/* Quirk flags */ +#define SLM_PKG_C6_USE_C7_MSR (1UL << 0) + struct perf_cstate_msr { u64 msr; struct perf_pmu_events_attr *attr; - bool (*test)(int idx); }; /* cstate_core PMU */ - static struct pmu cstate_core_pmu; static bool has_cstate_core; -enum perf_cstate_core_id { - /* - * cstate_core events - */ +enum perf_cstate_core_events { PERF_CSTATE_CORE_C1_RES = 0, PERF_CSTATE_CORE_C3_RES, PERF_CSTATE_CORE_C6_RES, @@ -130,69 +137,16 @@ enum perf_cstate_core_id { PERF_CSTATE_CORE_EVENT_MAX, }; -bool test_core(int idx) -{ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || - boot_cpu_data.x86 != 6) - return false; - - switch (boot_cpu_data.x86_model) { - case 30: /* 45nm Nehalem */ - case 26: /* 45nm Nehalem-EP */ - case 46: /* 45nm Nehalem-EX */ - - case 37: /* 32nm Westmere */ - case 44: /* 32nm Westmere-EP */ - case 47: /* 32nm Westmere-EX */ - if (idx == PERF_CSTATE_CORE_C3_RES || - idx == PERF_CSTATE_CORE_C6_RES) - return true; - break; - case 42: /* 32nm SandyBridge */ - case 45: /* 32nm SandyBridge-E/EN/EP */ - - case 58: /* 22nm IvyBridge */ - case 62: /* 22nm IvyBridge-EP/EX */ - - case 60: /* 22nm Haswell Core */ - case 63: /* 22nm Haswell Server */ - case 69: /* 22nm Haswell ULT */ - case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ - - case 61: /* 14nm Broadwell Core-M */ - case 86: /* 14nm Broadwell Xeon D */ - case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ - case 79: /* 
14nm Broadwell Server */ - - case 78: /* 14nm Skylake Mobile */ - case 94: /* 14nm Skylake Desktop */ - if (idx == PERF_CSTATE_CORE_C3_RES || - idx == PERF_CSTATE_CORE_C6_RES || - idx == PERF_CSTATE_CORE_C7_RES) - return true; - break; - case 55: /* 22nm Atom "Silvermont" */ - case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ - case 76: /* 14nm Atom "Airmont" */ - if (idx == PERF_CSTATE_CORE_C1_RES || - idx == PERF_CSTATE_CORE_C6_RES) - return true; - break; - } - - return false; -} - PMU_EVENT_ATTR_STRING(c1-residency, evattr_cstate_core_c1, "event=0x00"); PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_core_c3, "event=0x01"); PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_core_c6, "event=0x02"); PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_core_c7, "event=0x03"); static struct perf_cstate_msr core_msr[] = { - [PERF_CSTATE_CORE_C1_RES] = { MSR_CORE_C1_RES, &evattr_cstate_core_c1, test_core, }, - [PERF_CSTATE_CORE_C3_RES] = { MSR_CORE_C3_RESIDENCY, &evattr_cstate_core_c3, test_core, }, - [PERF_CSTATE_CORE_C6_RES] = { MSR_CORE_C6_RESIDENCY, &evattr_cstate_core_c6, test_core, }, - [PERF_CSTATE_CORE_C7_RES] = { MSR_CORE_C7_RESIDENCY, &evattr_cstate_core_c7, test_core, }, + [PERF_CSTATE_CORE_C1_RES] = { MSR_CORE_C1_RES, &evattr_cstate_core_c1 }, + [PERF_CSTATE_CORE_C3_RES] = { MSR_CORE_C3_RESIDENCY, &evattr_cstate_core_c3 }, + [PERF_CSTATE_CORE_C6_RES] = { MSR_CORE_C6_RESIDENCY, &evattr_cstate_core_c6 }, + [PERF_CSTATE_CORE_C7_RES] = { MSR_CORE_C7_RESIDENCY, &evattr_cstate_core_c7 }, }; static struct attribute *core_events_attrs[PERF_CSTATE_CORE_EVENT_MAX + 1] = { @@ -234,18 +188,11 @@ static const struct attribute_group *core_attr_groups[] = { NULL, }; -/* cstate_core PMU end */ - - /* cstate_pkg PMU */ - static struct pmu cstate_pkg_pmu; static bool has_cstate_pkg; -enum perf_cstate_pkg_id { - /* - * cstate_pkg events - */ +enum perf_cstate_pkg_events { PERF_CSTATE_PKG_C2_RES = 0, PERF_CSTATE_PKG_C3_RES, PERF_CSTATE_PKG_C6_RES, @@ -257,69 +204,6 @@ enum perf_cstate_pkg_id { PERF_CSTATE_PKG_EVENT_MAX, }; -bool test_pkg(int idx) -{ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || - boot_cpu_data.x86 != 6) - return false; - - switch (boot_cpu_data.x86_model) { - case 30: /* 45nm Nehalem */ - case 26: /* 45nm Nehalem-EP */ - case 46: /* 45nm Nehalem-EX */ - - case 37: /* 32nm Westmere */ - case 44: /* 32nm Westmere-EP */ - case 47: /* 32nm Westmere-EX */ - if (idx == PERF_CSTATE_CORE_C3_RES || - idx == PERF_CSTATE_CORE_C6_RES || - idx == PERF_CSTATE_CORE_C7_RES) - return true; - break; - case 42: /* 32nm SandyBridge */ - case 45: /* 32nm SandyBridge-E/EN/EP */ - - case 58: /* 22nm IvyBridge */ - case 62: /* 22nm IvyBridge-EP/EX */ - - case 60: /* 22nm Haswell Core */ - case 63: /* 22nm Haswell Server */ - case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ - - case 61: /* 14nm Broadwell Core-M */ - case 86: /* 14nm Broadwell Xeon D */ - case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ - case 79: /* 14nm Broadwell Server */ - - case 78: /* 14nm Skylake Mobile */ - case 94: /* 14nm Skylake Desktop */ - if (idx == PERF_CSTATE_PKG_C2_RES || - idx == PERF_CSTATE_PKG_C3_RES || - idx == PERF_CSTATE_PKG_C6_RES || - idx == PERF_CSTATE_PKG_C7_RES) - return true; - break; - case 55: /* 22nm Atom "Silvermont" */ - case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ - case 76: /* 14nm Atom "Airmont" */ - if (idx == PERF_CSTATE_CORE_C6_RES) - return true; - break; - case 69: /* 22nm Haswell ULT */ - if (idx == PERF_CSTATE_PKG_C2_RES || - idx == PERF_CSTATE_PKG_C3_RES 
|| - idx == PERF_CSTATE_PKG_C6_RES || - idx == PERF_CSTATE_PKG_C7_RES || - idx == PERF_CSTATE_PKG_C8_RES || - idx == PERF_CSTATE_PKG_C9_RES || - idx == PERF_CSTATE_PKG_C10_RES) - return true; - break; - } - - return false; -} - PMU_EVENT_ATTR_STRING(c2-residency, evattr_cstate_pkg_c2, "event=0x00"); PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_pkg_c3, "event=0x01"); PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_pkg_c6, "event=0x02"); @@ -329,13 +213,13 @@ PMU_EVENT_ATTR_STRING(c9-residency, evattr_cstate_pkg_c9, "event=0x05"); PMU_EVENT_ATTR_STRING(c10-residency, evattr_cstate_pkg_c10, "event=0x06"); static struct perf_cstate_msr pkg_msr[] = { - [PERF_CSTATE_PKG_C2_RES] = { MSR_PKG_C2_RESIDENCY, &evattr_cstate_pkg_c2, test_pkg, }, - [PERF_CSTATE_PKG_C3_RES] = { MSR_PKG_C3_RESIDENCY, &evattr_cstate_pkg_c3, test_pkg, }, - [PERF_CSTATE_PKG_C6_RES] = { MSR_PKG_C6_RESIDENCY, &evattr_cstate_pkg_c6, test_pkg, }, - [PERF_CSTATE_PKG_C7_RES] = { MSR_PKG_C7_RESIDENCY, &evattr_cstate_pkg_c7, test_pkg, }, - [PERF_CSTATE_PKG_C8_RES] = { MSR_PKG_C8_RESIDENCY, &evattr_cstate_pkg_c8, test_pkg, }, - [PERF_CSTATE_PKG_C9_RES] = { MSR_PKG_C9_RESIDENCY, &evattr_cstate_pkg_c9, test_pkg, }, - [PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &evattr_cstate_pkg_c10, test_pkg, }, + [PERF_CSTATE_PKG_C2_RES] = { MSR_PKG_C2_RESIDENCY, &evattr_cstate_pkg_c2 }, + [PERF_CSTATE_PKG_C3_RES] = { MSR_PKG_C3_RESIDENCY, &evattr_cstate_pkg_c3 }, + [PERF_CSTATE_PKG_C6_RES] = { MSR_PKG_C6_RESIDENCY, &evattr_cstate_pkg_c6 }, + [PERF_CSTATE_PKG_C7_RES] = { MSR_PKG_C7_RESIDENCY, &evattr_cstate_pkg_c7 }, + [PERF_CSTATE_PKG_C8_RES] = { MSR_PKG_C8_RESIDENCY, &evattr_cstate_pkg_c8 }, + [PERF_CSTATE_PKG_C9_RES] = { MSR_PKG_C9_RESIDENCY, &evattr_cstate_pkg_c9 }, + [PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &evattr_cstate_pkg_c10 }, }; static struct attribute *pkg_events_attrs[PERF_CSTATE_PKG_EVENT_MAX + 1] = { @@ -366,8 +250,6 @@ static const struct attribute_group *pkg_attr_groups[] = { NULL, }; -/* cstate_pkg PMU end*/ - static ssize_t cstate_get_attr_cpumask(struct device *dev, struct device_attribute *attr, char *buf) @@ -385,7 +267,7 @@ static ssize_t cstate_get_attr_cpumask(struct device *dev, static int cstate_pmu_event_init(struct perf_event *event) { u64 cfg = event->attr.config; - int ret = 0; + int cpu; if (event->attr.type != event->pmu->type) return -ENOENT; @@ -400,26 +282,36 @@ static int cstate_pmu_event_init(struct perf_event *event) event->attr.sample_period) /* no sampling */ return -EINVAL; + if (event->cpu < 0) + return -EINVAL; + if (event->pmu == &cstate_core_pmu) { if (cfg >= PERF_CSTATE_CORE_EVENT_MAX) return -EINVAL; if (!core_msr[cfg].attr) return -EINVAL; event->hw.event_base = core_msr[cfg].msr; + cpu = cpumask_any_and(&cstate_core_cpu_mask, + topology_sibling_cpumask(event->cpu)); } else if (event->pmu == &cstate_pkg_pmu) { if (cfg >= PERF_CSTATE_PKG_EVENT_MAX) return -EINVAL; if (!pkg_msr[cfg].attr) return -EINVAL; event->hw.event_base = pkg_msr[cfg].msr; - } else + cpu = cpumask_any_and(&cstate_pkg_cpu_mask, + topology_core_cpumask(event->cpu)); + } else { return -ENOENT; + } + + if (cpu >= nr_cpu_ids) + return -ENODEV; - /* must be done before validate_group */ + event->cpu = cpu; event->hw.config = cfg; event->hw.idx = -1; - - return ret; + return 0; } static inline u64 cstate_pmu_read_counter(struct perf_event *event) @@ -469,172 +361,91 @@ static int cstate_pmu_event_add(struct perf_event *event, int mode) return 0; } +/* + * Check if exiting cpu is the designated reader. 
If so migrate the + * events when there is a valid target available + */ static void cstate_cpu_exit(int cpu) { - int i, id, target; + unsigned int target; - /* cpu exit for cstate core */ - if (has_cstate_core) { - id = topology_core_id(cpu); - target = -1; - - for_each_online_cpu(i) { - if (i == cpu) - continue; - if (id == topology_core_id(i)) { - target = i; - break; - } - } - if (cpumask_test_and_clear_cpu(cpu, &cstate_core_cpu_mask) && target >= 0) + if (has_cstate_core && + cpumask_test_and_clear_cpu(cpu, &cstate_core_cpu_mask)) { + + target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu); + /* Migrate events if there is a valid target */ + if (target < nr_cpu_ids) { cpumask_set_cpu(target, &cstate_core_cpu_mask); - WARN_ON(cpumask_empty(&cstate_core_cpu_mask)); - if (target >= 0) perf_pmu_migrate_context(&cstate_core_pmu, cpu, target); + } } - /* cpu exit for cstate pkg */ - if (has_cstate_pkg) { - id = topology_physical_package_id(cpu); - target = -1; - - for_each_online_cpu(i) { - if (i == cpu) - continue; - if (id == topology_physical_package_id(i)) { - target = i; - break; - } - } - if (cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask) && target >= 0) + if (has_cstate_pkg && + cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask)) { + + target = cpumask_any_but(topology_core_cpumask(cpu), cpu); + /* Migrate events if there is a valid target */ + if (target < nr_cpu_ids) { cpumask_set_cpu(target, &cstate_pkg_cpu_mask); - WARN_ON(cpumask_empty(&cstate_pkg_cpu_mask)); - if (target >= 0) perf_pmu_migrate_context(&cstate_pkg_pmu, cpu, target); + } } } static void cstate_cpu_init(int cpu) { - int i, id; + unsigned int target; - /* cpu init for cstate core */ - if (has_cstate_core) { - id = topology_core_id(cpu); - for_each_cpu(i, &cstate_core_cpu_mask) { - if (id == topology_core_id(i)) - break; - } - if (i >= nr_cpu_ids) - cpumask_set_cpu(cpu, &cstate_core_cpu_mask); - } + /* + * If this is the first online thread of that core, set it in + * the core cpu mask as the designated reader. + */ + target = cpumask_any_and(&cstate_core_cpu_mask, + topology_sibling_cpumask(cpu)); - /* cpu init for cstate pkg */ - if (has_cstate_pkg) { - id = topology_physical_package_id(cpu); - for_each_cpu(i, &cstate_pkg_cpu_mask) { - if (id == topology_physical_package_id(i)) - break; - } - if (i >= nr_cpu_ids) - cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask); - } + if (has_cstate_core && target >= nr_cpu_ids) + cpumask_set_cpu(cpu, &cstate_core_cpu_mask); + + /* + * If this is the first online thread of that package, set it + * in the package cpu mask as the designated reader. + */ + target = cpumask_any_and(&cstate_pkg_cpu_mask, + topology_core_cpumask(cpu)); + if (has_cstate_pkg && target >= nr_cpu_ids) + cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask); } static int cstate_cpu_notifier(struct notifier_block *self, - unsigned long action, void *hcpu) + unsigned long action, void *hcpu) { unsigned int cpu = (long)hcpu; switch (action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - break; case CPU_STARTING: cstate_cpu_init(cpu); break; - case CPU_UP_CANCELED: - case CPU_DYING: - break; - case CPU_ONLINE: - case CPU_DEAD: - break; case CPU_DOWN_PREPARE: cstate_cpu_exit(cpu); break; default: break; } - return NOTIFY_OK; } -/* - * Probe the cstate events and insert the available one into sysfs attrs - * Return false if there is no available events. 
- */ -static bool cstate_probe_msr(struct perf_cstate_msr *msr, - struct attribute **events_attrs, - int max_event_nr) -{ - int i, j = 0; - u64 val; - - /* Probe the cstate events. */ - for (i = 0; i < max_event_nr; i++) { - if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val)) - msr[i].attr = NULL; - } - - /* List remaining events in the sysfs attrs. */ - for (i = 0; i < max_event_nr; i++) { - if (msr[i].attr) - events_attrs[j++] = &msr[i].attr->attr.attr; - } - events_attrs[j] = NULL; - - return (j > 0) ? true : false; -} - -static int __init cstate_init(void) -{ - /* SLM has different MSR for PKG C6 */ - switch (boot_cpu_data.x86_model) { - case 55: - case 76: - case 77: - pkg_msr[PERF_CSTATE_PKG_C6_RES].msr = MSR_PKG_C7_RESIDENCY; - } - - if (cstate_probe_msr(core_msr, core_events_attrs, PERF_CSTATE_CORE_EVENT_MAX)) - has_cstate_core = true; - - if (cstate_probe_msr(pkg_msr, pkg_events_attrs, PERF_CSTATE_PKG_EVENT_MAX)) - has_cstate_pkg = true; - - return (has_cstate_core || has_cstate_pkg) ? 0 : -ENODEV; -} - -static void __init cstate_cpumask_init(void) -{ - int cpu; - - cpu_notifier_register_begin(); - - for_each_online_cpu(cpu) - cstate_cpu_init(cpu); - - __perf_cpu_notifier(cstate_cpu_notifier); - - cpu_notifier_register_done(); -} +static struct notifier_block cstate_cpu_nb = { + .notifier_call = cstate_cpu_notifier, + .priority = CPU_PRI_PERF + 1, +}; static struct pmu cstate_core_pmu = { .attr_groups = core_attr_groups, .name = "cstate_core", .task_ctx_nr = perf_invalid_context, .event_init = cstate_pmu_event_init, - .add = cstate_pmu_event_add, /* must have */ - .del = cstate_pmu_event_del, /* must have */ + .add = cstate_pmu_event_add, + .del = cstate_pmu_event_del, .start = cstate_pmu_event_start, .stop = cstate_pmu_event_stop, .read = cstate_pmu_event_update, @@ -646,49 +457,203 @@ static struct pmu cstate_pkg_pmu = { .name = "cstate_pkg", .task_ctx_nr = perf_invalid_context, .event_init = cstate_pmu_event_init, - .add = cstate_pmu_event_add, /* must have */ - .del = cstate_pmu_event_del, /* must have */ + .add = cstate_pmu_event_add, + .del = cstate_pmu_event_del, .start = cstate_pmu_event_start, .stop = cstate_pmu_event_stop, .read = cstate_pmu_event_update, .capabilities = PERF_PMU_CAP_NO_INTERRUPT, }; -static void __init cstate_pmus_register(void) +static const struct cstate_model nhm_cstates __initconst = { + .core_events = BIT(PERF_CSTATE_CORE_C3_RES) | + BIT(PERF_CSTATE_CORE_C6_RES), + + .pkg_events = BIT(PERF_CSTATE_PKG_C3_RES) | + BIT(PERF_CSTATE_PKG_C6_RES) | + BIT(PERF_CSTATE_PKG_C7_RES), +}; + +static const struct cstate_model snb_cstates __initconst = { + .core_events = BIT(PERF_CSTATE_CORE_C3_RES) | + BIT(PERF_CSTATE_CORE_C6_RES) | + BIT(PERF_CSTATE_CORE_C7_RES), + + .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) | + BIT(PERF_CSTATE_PKG_C3_RES) | + BIT(PERF_CSTATE_PKG_C6_RES) | + BIT(PERF_CSTATE_PKG_C7_RES), +}; + +static const struct cstate_model hswult_cstates __initconst = { + .core_events = BIT(PERF_CSTATE_CORE_C3_RES) | + BIT(PERF_CSTATE_CORE_C6_RES) | + BIT(PERF_CSTATE_CORE_C7_RES), + + .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) | + BIT(PERF_CSTATE_PKG_C3_RES) | + BIT(PERF_CSTATE_PKG_C6_RES) | + BIT(PERF_CSTATE_PKG_C7_RES) | + BIT(PERF_CSTATE_PKG_C8_RES) | + BIT(PERF_CSTATE_PKG_C9_RES) | + BIT(PERF_CSTATE_PKG_C10_RES), +}; + +static const struct cstate_model slm_cstates __initconst = { + .core_events = BIT(PERF_CSTATE_CORE_C1_RES) | + BIT(PERF_CSTATE_CORE_C6_RES), + + .pkg_events = BIT(PERF_CSTATE_PKG_C6_RES), + .quirks = SLM_PKG_C6_USE_C7_MSR, +}; + +#define 
X86_CSTATES_MODEL(model, states) \ + { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long) &(states) } + +static const struct x86_cpu_id intel_cstates_match[] __initconst = { + X86_CSTATES_MODEL(30, nhm_cstates), /* 45nm Nehalem */ + X86_CSTATES_MODEL(26, nhm_cstates), /* 45nm Nehalem-EP */ + X86_CSTATES_MODEL(46, nhm_cstates), /* 45nm Nehalem-EX */ + + X86_CSTATES_MODEL(37, nhm_cstates), /* 32nm Westmere */ + X86_CSTATES_MODEL(44, nhm_cstates), /* 32nm Westmere-EP */ + X86_CSTATES_MODEL(47, nhm_cstates), /* 32nm Westmere-EX */ + + X86_CSTATES_MODEL(42, snb_cstates), /* 32nm SandyBridge */ + X86_CSTATES_MODEL(45, snb_cstates), /* 32nm SandyBridge-E/EN/EP */ + + X86_CSTATES_MODEL(58, snb_cstates), /* 22nm IvyBridge */ + X86_CSTATES_MODEL(62, snb_cstates), /* 22nm IvyBridge-EP/EX */ + + X86_CSTATES_MODEL(60, snb_cstates), /* 22nm Haswell Core */ + X86_CSTATES_MODEL(63, snb_cstates), /* 22nm Haswell Server */ + X86_CSTATES_MODEL(70, snb_cstates), /* 22nm Haswell + GT3e */ + + X86_CSTATES_MODEL(69, hswult_cstates), /* 22nm Haswell ULT */ + + X86_CSTATES_MODEL(55, slm_cstates), /* 22nm Atom Silvermont */ + X86_CSTATES_MODEL(77, slm_cstates), /* 22nm Atom Avoton/Rangely */ + X86_CSTATES_MODEL(76, slm_cstates), /* 22nm Atom Airmont */ + + X86_CSTATES_MODEL(61, snb_cstates), /* 14nm Broadwell Core-M */ + X86_CSTATES_MODEL(86, snb_cstates), /* 14nm Broadwell Xeon D */ + X86_CSTATES_MODEL(71, snb_cstates), /* 14nm Broadwell + GT3e */ + X86_CSTATES_MODEL(79, snb_cstates), /* 14nm Broadwell Server */ + + X86_CSTATES_MODEL(78, snb_cstates), /* 14nm Skylake Mobile */ + X86_CSTATES_MODEL(94, snb_cstates), /* 14nm Skylake Desktop */ + { }, +}; +MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); + +/* + * Probe the cstate events and insert the available one into sysfs attrs + * Return false if there are no available events. + */ +static bool __init cstate_probe_msr(const unsigned long evmsk, int max, + struct perf_cstate_msr *msr, + struct attribute **attrs) { - int err; + bool found = false; + unsigned int bit; + u64 val; + + for (bit = 0; bit < max; bit++) { + if (test_bit(bit, &evmsk) && !rdmsrl_safe(msr[bit].msr, &val)) { + *attrs++ = &msr[bit].attr->attr.attr; + found = true; + } else { + msr[bit].attr = NULL; + } + } + *attrs = NULL; + + return found; +} + +static int __init cstate_probe(const struct cstate_model *cm) +{ + /* SLM has different MSR for PKG C6 */ + if (cm->quirks & SLM_PKG_C6_USE_C7_MSR) + pkg_msr[PERF_CSTATE_PKG_C6_RES].msr = MSR_PKG_C7_RESIDENCY; + + has_cstate_core = cstate_probe_msr(cm->core_events, + PERF_CSTATE_CORE_EVENT_MAX, + core_msr, core_events_attrs); + + has_cstate_pkg = cstate_probe_msr(cm->pkg_events, + PERF_CSTATE_PKG_EVENT_MAX, + pkg_msr, pkg_events_attrs); + + return (has_cstate_core || has_cstate_pkg) ? 
0 : -ENODEV; +} + +static inline void cstate_cleanup(void) +{ + if (has_cstate_core) + perf_pmu_unregister(&cstate_core_pmu); + + if (has_cstate_pkg) + perf_pmu_unregister(&cstate_pkg_pmu); +} + +static int __init cstate_init(void) +{ + int cpu, err; + + cpu_notifier_register_begin(); + for_each_online_cpu(cpu) + cstate_cpu_init(cpu); if (has_cstate_core) { err = perf_pmu_register(&cstate_core_pmu, cstate_core_pmu.name, -1); - if (WARN_ON(err)) - pr_info("Failed to register PMU %s error %d\n", - cstate_core_pmu.name, err); + if (err) { + has_cstate_core = false; + pr_info("Failed to register cstate core pmu\n"); + goto out; + } } if (has_cstate_pkg) { err = perf_pmu_register(&cstate_pkg_pmu, cstate_pkg_pmu.name, -1); - if (WARN_ON(err)) - pr_info("Failed to register PMU %s error %d\n", - cstate_pkg_pmu.name, err); + if (err) { + has_cstate_pkg = false; + pr_info("Failed to register cstate pkg pmu\n"); + cstate_cleanup(); + goto out; + } } + __register_cpu_notifier(&cstate_cpu_nb); +out: + cpu_notifier_register_done(); + return err; } static int __init cstate_pmu_init(void) { + const struct x86_cpu_id *id; int err; - if (cpu_has_hypervisor) + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return -ENODEV; + + id = x86_match_cpu(intel_cstates_match); + if (!id) return -ENODEV; - err = cstate_init(); + err = cstate_probe((const struct cstate_model *) id->driver_data); if (err) return err; - cstate_cpumask_init(); - - cstate_pmus_register(); - - return 0; + return cstate_init(); } +module_init(cstate_pmu_init); -device_initcall(cstate_pmu_init); +static void __exit cstate_pmu_exit(void) +{ + cpu_notifier_register_begin(); + __unregister_cpu_notifier(&cstate_cpu_nb); + cstate_cleanup(); + cpu_notifier_register_done(); +} +module_exit(cstate_pmu_exit); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 8584b90d8e0b..7ce9f3f669e6 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -645,6 +645,12 @@ struct event_constraint intel_slm_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; +struct event_constraint intel_glm_pebs_event_constraints[] = { + /* Allow all events as PEBS with no flags */ + INTEL_ALL_EVENT_CONSTRAINT(0, 0x1), + EVENT_CONSTRAINT_END +}; + struct event_constraint intel_nehalem_pebs_event_constraints[] = { INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 1ca5d1e7d4f2..9e2b40cdb05f 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -14,7 +14,8 @@ enum { LBR_FORMAT_EIP_FLAGS = 0x03, LBR_FORMAT_EIP_FLAGS2 = 0x04, LBR_FORMAT_INFO = 0x05, - LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_INFO, + LBR_FORMAT_TIME = 0x06, + LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_TIME, }; static enum { @@ -464,6 +465,16 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) abort = !!(info & LBR_INFO_ABORT); cycles = (info & LBR_INFO_CYCLES); } + + if (lbr_format == LBR_FORMAT_TIME) { + mis = !!(from & LBR_FROM_FLAG_MISPRED); + pred = !mis; + skip = 1; + cycles = ((to >> 48) & LBR_INFO_CYCLES); + + to = (u64)((((s64)to) << 16) >> 16); + } + if (lbr_flags & LBR_EIP_FLAGS) { mis = !!(from & LBR_FROM_FLAG_MISPRED); pred = !mis; @@ -1049,6 +1060,24 @@ void __init intel_pmu_lbr_init_atom(void) pr_cont("8-deep LBR, "); } +/* slm */ +void __init intel_pmu_lbr_init_slm(void) +{ + x86_pmu.lbr_nr = 8; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_CORE_FROM; + x86_pmu.lbr_to = 
MSR_LBR_CORE_TO; + + x86_pmu.lbr_sel_mask = LBR_SEL_MASK; + x86_pmu.lbr_sel_map = nhm_lbr_sel_map; + + /* + * SW branch filter usage: + * - compensate for lack of HW filter + */ + pr_cont("8-deep LBR, "); +} + /* Knights Landing */ void intel_pmu_lbr_init_knl(void) { diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 09a77dbc73c9..04bb5fb5a8d7 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -67,11 +67,13 @@ static struct pt_cap_desc { PT_CAP(max_subleaf, 0, CR_EAX, 0xffffffff), PT_CAP(cr3_filtering, 0, CR_EBX, BIT(0)), PT_CAP(psb_cyc, 0, CR_EBX, BIT(1)), + PT_CAP(ip_filtering, 0, CR_EBX, BIT(2)), PT_CAP(mtc, 0, CR_EBX, BIT(3)), PT_CAP(topa_output, 0, CR_ECX, BIT(0)), PT_CAP(topa_multiple_entries, 0, CR_ECX, BIT(1)), PT_CAP(single_range_output, 0, CR_ECX, BIT(2)), PT_CAP(payloads_lip, 0, CR_ECX, BIT(31)), + PT_CAP(num_address_ranges, 1, CR_EAX, 0x3), PT_CAP(mtc_periods, 1, CR_EAX, 0xffff0000), PT_CAP(cycle_thresholds, 1, CR_EBX, 0xffff), PT_CAP(psb_periods, 1, CR_EBX, 0xffff0000), @@ -125,9 +127,46 @@ static struct attribute_group pt_format_group = { .attrs = pt_formats_attr, }; +static ssize_t +pt_timing_attr_show(struct device *dev, struct device_attribute *attr, + char *page) +{ + struct perf_pmu_events_attr *pmu_attr = + container_of(attr, struct perf_pmu_events_attr, attr); + + switch (pmu_attr->id) { + case 0: + return sprintf(page, "%lu\n", pt_pmu.max_nonturbo_ratio); + case 1: + return sprintf(page, "%u:%u\n", + pt_pmu.tsc_art_num, + pt_pmu.tsc_art_den); + default: + break; + } + + return -EINVAL; +} + +PMU_EVENT_ATTR(max_nonturbo_ratio, timing_attr_max_nonturbo_ratio, 0, + pt_timing_attr_show); +PMU_EVENT_ATTR(tsc_art_ratio, timing_attr_tsc_art_ratio, 1, + pt_timing_attr_show); + +static struct attribute *pt_timing_attr[] = { + &timing_attr_max_nonturbo_ratio.attr.attr, + &timing_attr_tsc_art_ratio.attr.attr, + NULL, +}; + +static struct attribute_group pt_timing_group = { + .attrs = pt_timing_attr, +}; + static const struct attribute_group *pt_attr_groups[] = { &pt_cap_group, &pt_format_group, + &pt_timing_group, NULL, }; @@ -140,6 +179,23 @@ static int __init pt_pmu_hw_init(void) int ret; long i; + rdmsrl(MSR_PLATFORM_INFO, reg); + pt_pmu.max_nonturbo_ratio = (reg & 0xff00) >> 8; + + /* + * if available, read in TSC to core crystal clock ratio, + * otherwise, zero for numerator stands for "not enumerated" + * as per SDM + */ + if (boot_cpu_data.cpuid_level >= CPUID_TSC_LEAF) { + u32 eax, ebx, ecx, edx; + + cpuid(CPUID_TSC_LEAF, &eax, &ebx, &ecx, &edx); + + pt_pmu.tsc_art_num = ebx; + pt_pmu.tsc_art_den = eax; + } + if (boot_cpu_has(X86_FEATURE_VMX)) { /* * Intel SDM, 36.5 "Tracing post-VMXON" says that @@ -263,6 +319,75 @@ static bool pt_event_valid(struct perf_event *event) * These all are cpu affine and operate on a local PT */ +/* Address ranges and their corresponding msr configuration registers */ +static const struct pt_address_range { + unsigned long msr_a; + unsigned long msr_b; + unsigned int reg_off; +} pt_address_ranges[] = { + { + .msr_a = MSR_IA32_RTIT_ADDR0_A, + .msr_b = MSR_IA32_RTIT_ADDR0_B, + .reg_off = RTIT_CTL_ADDR0_OFFSET, + }, + { + .msr_a = MSR_IA32_RTIT_ADDR1_A, + .msr_b = MSR_IA32_RTIT_ADDR1_B, + .reg_off = RTIT_CTL_ADDR1_OFFSET, + }, + { + .msr_a = MSR_IA32_RTIT_ADDR2_A, + .msr_b = MSR_IA32_RTIT_ADDR2_B, + .reg_off = RTIT_CTL_ADDR2_OFFSET, + }, + { + .msr_a = MSR_IA32_RTIT_ADDR3_A, + .msr_b = MSR_IA32_RTIT_ADDR3_B, + .reg_off = RTIT_CTL_ADDR3_OFFSET, + } +}; + +static u64 pt_config_filters(struct 
perf_event *event) +{ + struct pt_filters *filters = event->hw.addr_filters; + struct pt *pt = this_cpu_ptr(&pt_ctx); + unsigned int range = 0; + u64 rtit_ctl = 0; + + if (!filters) + return 0; + + perf_event_addr_filters_sync(event); + + for (range = 0; range < filters->nr_filters; range++) { + struct pt_filter *filter = &filters->filter[range]; + + /* + * Note, if the range has zero start/end addresses due + * to its dynamic object not being loaded yet, we just + * go ahead and program zeroed range, which will simply + * produce no data. Note^2: if executable code at 0x0 + * is a concern, we can set up an "invalid" configuration + * such as msr_b < msr_a. + */ + + /* avoid redundant msr writes */ + if (pt->filters.filter[range].msr_a != filter->msr_a) { + wrmsrl(pt_address_ranges[range].msr_a, filter->msr_a); + pt->filters.filter[range].msr_a = filter->msr_a; + } + + if (pt->filters.filter[range].msr_b != filter->msr_b) { + wrmsrl(pt_address_ranges[range].msr_b, filter->msr_b); + pt->filters.filter[range].msr_b = filter->msr_b; + } + + rtit_ctl |= filter->config << pt_address_ranges[range].reg_off; + } + + return rtit_ctl; +} + static void pt_config(struct perf_event *event) { u64 reg; @@ -272,7 +397,8 @@ static void pt_config(struct perf_event *event) wrmsrl(MSR_IA32_RTIT_STATUS, 0); } - reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN; + reg = pt_config_filters(event); + reg |= RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN; if (!event->attr.exclude_kernel) reg |= RTIT_CTL_OS; @@ -709,6 +835,7 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf, /* clear STOP and INT from current entry */ buf->topa_index[buf->stop_pos]->stop = 0; + buf->topa_index[buf->stop_pos]->intr = 0; buf->topa_index[buf->intr_pos]->intr = 0; /* how many pages till the STOP marker */ @@ -733,6 +860,7 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf, buf->intr_pos = idx; buf->topa_index[buf->stop_pos]->stop = 1; + buf->topa_index[buf->stop_pos]->intr = 1; buf->topa_index[buf->intr_pos]->intr = 1; return 0; @@ -919,24 +1047,80 @@ static void pt_buffer_free_aux(void *data) kfree(buf); } -/** - * pt_buffer_is_full() - check if the buffer is full - * @buf: PT buffer. - * @pt: Per-cpu pt handle. - * - * If the user hasn't read data from the output region that aux_head - * points to, the buffer is considered full: the user needs to read at - * least this region and update aux_tail to point past it. - */ -static bool pt_buffer_is_full(struct pt_buffer *buf, struct pt *pt) +static int pt_addr_filters_init(struct perf_event *event) { - if (buf->snapshot) - return false; + struct pt_filters *filters; + int node = event->cpu == -1 ? 
-1 : cpu_to_node(event->cpu); + + if (!pt_cap_get(PT_CAP_num_address_ranges)) + return 0; + + filters = kzalloc_node(sizeof(struct pt_filters), GFP_KERNEL, node); + if (!filters) + return -ENOMEM; + + if (event->parent) + memcpy(filters, event->parent->hw.addr_filters, + sizeof(*filters)); + + event->hw.addr_filters = filters; + + return 0; +} + +static void pt_addr_filters_fini(struct perf_event *event) +{ + kfree(event->hw.addr_filters); + event->hw.addr_filters = NULL; +} + +static int pt_event_addr_filters_validate(struct list_head *filters) +{ + struct perf_addr_filter *filter; + int range = 0; + + list_for_each_entry(filter, filters, entry) { + /* PT doesn't support single address triggers */ + if (!filter->range) + return -EOPNOTSUPP; + + if (!filter->inode && !kernel_ip(filter->offset)) + return -EINVAL; + + if (++range > pt_cap_get(PT_CAP_num_address_ranges)) + return -EOPNOTSUPP; + } + + return 0; +} + +static void pt_event_addr_filters_sync(struct perf_event *event) +{ + struct perf_addr_filters_head *head = perf_event_addr_filters(event); + unsigned long msr_a, msr_b, *offs = event->addr_filters_offs; + struct pt_filters *filters = event->hw.addr_filters; + struct perf_addr_filter *filter; + int range = 0; + + if (!filters) + return; - if (local_read(&buf->data_size) >= pt->handle.size) - return true; + list_for_each_entry(filter, &head->list, entry) { + if (filter->inode && !offs[range]) { + msr_a = msr_b = 0; + } else { + /* apply the offset */ + msr_a = filter->offset + offs[range]; + msr_b = filter->size + msr_a; + } + + filters->filter[range].msr_a = msr_a; + filters->filter[range].msr_b = msr_b; + filters->filter[range].config = filter->filter ? 1 : 2; + range++; + } - return false; + filters->nr_filters = range; } /** @@ -953,7 +1137,7 @@ void intel_pt_interrupt(void) * after PT has been disabled by pt_event_stop(). Make sure we don't * do anything (particularly, re-enable) for this event here. */ - if (!ACCESS_ONCE(pt->handle_nmi)) + if (!READ_ONCE(pt->handle_nmi)) return; /* @@ -1038,23 +1222,36 @@ EXPORT_SYMBOL_GPL(intel_pt_handle_vmx); static void pt_event_start(struct perf_event *event, int mode) { + struct hw_perf_event *hwc = &event->hw; struct pt *pt = this_cpu_ptr(&pt_ctx); - struct pt_buffer *buf = perf_get_aux(&pt->handle); + struct pt_buffer *buf; if (READ_ONCE(pt->vmx_on)) return; - if (!buf || pt_buffer_is_full(buf, pt)) { - event->hw.state = PERF_HES_STOPPED; - return; + buf = perf_aux_output_begin(&pt->handle, event); + if (!buf) + goto fail_stop; + + pt_buffer_reset_offsets(buf, pt->handle.head); + if (!buf->snapshot) { + if (pt_buffer_reset_markers(buf, &pt->handle)) + goto fail_end_stop; } - ACCESS_ONCE(pt->handle_nmi) = 1; - event->hw.state = 0; + WRITE_ONCE(pt->handle_nmi, 1); + hwc->state = 0; pt_config_buffer(buf->cur->table, buf->cur_idx, buf->output_off); pt_config(event); + + return; + +fail_end_stop: + perf_aux_output_end(&pt->handle, 0, true); +fail_stop: + hwc->state = PERF_HES_STOPPED; } static void pt_event_stop(struct perf_event *event, int mode) @@ -1065,7 +1262,7 @@ static void pt_event_stop(struct perf_event *event, int mode) * Protect against the PMI racing with disabling wrmsr, * see comment in intel_pt_interrupt(). 
*/ - ACCESS_ONCE(pt->handle_nmi) = 0; + WRITE_ONCE(pt->handle_nmi, 0); pt_config_stop(event); @@ -1088,19 +1285,7 @@ static void pt_event_stop(struct perf_event *event, int mode) pt_handle_status(pt); pt_update_head(pt); - } -} - -static void pt_event_del(struct perf_event *event, int mode) -{ - struct pt *pt = this_cpu_ptr(&pt_ctx); - struct pt_buffer *buf; - pt_event_stop(event, PERF_EF_UPDATE); - - buf = perf_get_aux(&pt->handle); - - if (buf) { if (buf->snapshot) pt->handle.head = local_xchg(&buf->data_size, @@ -1110,9 +1295,13 @@ static void pt_event_del(struct perf_event *event, int mode) } } +static void pt_event_del(struct perf_event *event, int mode) +{ + pt_event_stop(event, PERF_EF_UPDATE); +} + static int pt_event_add(struct perf_event *event, int mode) { - struct pt_buffer *buf; struct pt *pt = this_cpu_ptr(&pt_ctx); struct hw_perf_event *hwc = &event->hw; int ret = -EBUSY; @@ -1120,34 +1309,18 @@ static int pt_event_add(struct perf_event *event, int mode) if (pt->handle.event) goto fail; - buf = perf_aux_output_begin(&pt->handle, event); - ret = -EINVAL; - if (!buf) - goto fail_stop; - - pt_buffer_reset_offsets(buf, pt->handle.head); - if (!buf->snapshot) { - ret = pt_buffer_reset_markers(buf, &pt->handle); - if (ret) - goto fail_end_stop; - } - if (mode & PERF_EF_START) { pt_event_start(event, 0); - ret = -EBUSY; + ret = -EINVAL; if (hwc->state == PERF_HES_STOPPED) - goto fail_end_stop; + goto fail; } else { hwc->state = PERF_HES_STOPPED; } - return 0; - -fail_end_stop: - perf_aux_output_end(&pt->handle, 0, true); -fail_stop: - hwc->state = PERF_HES_STOPPED; + ret = 0; fail: + return ret; } @@ -1157,6 +1330,7 @@ static void pt_event_read(struct perf_event *event) static void pt_event_destroy(struct perf_event *event) { + pt_addr_filters_fini(event); x86_del_exclusive(x86_lbr_exclusive_pt); } @@ -1171,6 +1345,11 @@ static int pt_event_init(struct perf_event *event) if (x86_add_exclusive(x86_lbr_exclusive_pt)) return -EBUSY; + if (pt_addr_filters_init(event)) { + x86_del_exclusive(x86_lbr_exclusive_pt); + return -ENOMEM; + } + event->destroy = pt_event_destroy; return 0; @@ -1190,7 +1369,7 @@ static __init int pt_init(void) BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE); - if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT)) + if (!boot_cpu_has(X86_FEATURE_INTEL_PT)) return -ENODEV; get_online_cpus(); @@ -1224,16 +1403,21 @@ static __init int pt_init(void) PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF; pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE; - pt_pmu.pmu.attr_groups = pt_attr_groups; - pt_pmu.pmu.task_ctx_nr = perf_sw_context; - pt_pmu.pmu.event_init = pt_event_init; - pt_pmu.pmu.add = pt_event_add; - pt_pmu.pmu.del = pt_event_del; - pt_pmu.pmu.start = pt_event_start; - pt_pmu.pmu.stop = pt_event_stop; - pt_pmu.pmu.read = pt_event_read; - pt_pmu.pmu.setup_aux = pt_buffer_setup_aux; - pt_pmu.pmu.free_aux = pt_buffer_free_aux; + pt_pmu.pmu.attr_groups = pt_attr_groups; + pt_pmu.pmu.task_ctx_nr = perf_sw_context; + pt_pmu.pmu.event_init = pt_event_init; + pt_pmu.pmu.add = pt_event_add; + pt_pmu.pmu.del = pt_event_del; + pt_pmu.pmu.start = pt_event_start; + pt_pmu.pmu.stop = pt_event_stop; + pt_pmu.pmu.read = pt_event_read; + pt_pmu.pmu.setup_aux = pt_buffer_setup_aux; + pt_pmu.pmu.free_aux = pt_buffer_free_aux; + pt_pmu.pmu.addr_filters_sync = pt_event_addr_filters_sync; + pt_pmu.pmu.addr_filters_validate = pt_event_addr_filters_validate; + pt_pmu.pmu.nr_addr_filters = + pt_cap_get(PT_CAP_num_address_ranges); + ret = 
perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1); return ret; diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h index 3abb5f5cccc8..efffa4a09f68 100644 --- a/arch/x86/events/intel/pt.h +++ b/arch/x86/events/intel/pt.h @@ -20,6 +20,40 @@ #define __INTEL_PT_H__ /* + * PT MSR bit definitions + */ +#define RTIT_CTL_TRACEEN BIT(0) +#define RTIT_CTL_CYCLEACC BIT(1) +#define RTIT_CTL_OS BIT(2) +#define RTIT_CTL_USR BIT(3) +#define RTIT_CTL_CR3EN BIT(7) +#define RTIT_CTL_TOPA BIT(8) +#define RTIT_CTL_MTC_EN BIT(9) +#define RTIT_CTL_TSC_EN BIT(10) +#define RTIT_CTL_DISRETC BIT(11) +#define RTIT_CTL_BRANCH_EN BIT(13) +#define RTIT_CTL_MTC_RANGE_OFFSET 14 +#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET) +#define RTIT_CTL_CYC_THRESH_OFFSET 19 +#define RTIT_CTL_CYC_THRESH (0x0full << RTIT_CTL_CYC_THRESH_OFFSET) +#define RTIT_CTL_PSB_FREQ_OFFSET 24 +#define RTIT_CTL_PSB_FREQ (0x0full << RTIT_CTL_PSB_FREQ_OFFSET) +#define RTIT_CTL_ADDR0_OFFSET 32 +#define RTIT_CTL_ADDR0 (0x0full << RTIT_CTL_ADDR0_OFFSET) +#define RTIT_CTL_ADDR1_OFFSET 36 +#define RTIT_CTL_ADDR1 (0x0full << RTIT_CTL_ADDR1_OFFSET) +#define RTIT_CTL_ADDR2_OFFSET 40 +#define RTIT_CTL_ADDR2 (0x0full << RTIT_CTL_ADDR2_OFFSET) +#define RTIT_CTL_ADDR3_OFFSET 44 +#define RTIT_CTL_ADDR3 (0x0full << RTIT_CTL_ADDR3_OFFSET) +#define RTIT_STATUS_FILTEREN BIT(0) +#define RTIT_STATUS_CONTEXTEN BIT(1) +#define RTIT_STATUS_TRIGGEREN BIT(2) +#define RTIT_STATUS_BUFFOVF BIT(3) +#define RTIT_STATUS_ERROR BIT(4) +#define RTIT_STATUS_STOPPED BIT(5) + +/* * Single-entry ToPA: when this close to region boundary, switch * buffers to avoid losing data. */ @@ -48,15 +82,20 @@ struct topa_entry { #define PT_CPUID_LEAVES 2 #define PT_CPUID_REGS_NUM 4 /* number of regsters (eax, ebx, ecx, edx) */ +/* TSC to Core Crystal Clock Ratio */ +#define CPUID_TSC_LEAF 0x15 + enum pt_capabilities { PT_CAP_max_subleaf = 0, PT_CAP_cr3_filtering, PT_CAP_psb_cyc, + PT_CAP_ip_filtering, PT_CAP_mtc, PT_CAP_topa_output, PT_CAP_topa_multiple_entries, PT_CAP_single_range_output, PT_CAP_payloads_lip, + PT_CAP_num_address_ranges, PT_CAP_mtc_periods, PT_CAP_cycle_thresholds, PT_CAP_psb_periods, @@ -66,6 +105,9 @@ struct pt_pmu { struct pmu pmu; u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES]; bool vmx; + unsigned long max_nonturbo_ratio; + unsigned int tsc_art_num; + unsigned int tsc_art_den; }; /** @@ -104,14 +146,40 @@ struct pt_buffer { struct topa_entry *topa_index[0]; }; +#define PT_FILTERS_NUM 4 + +/** + * struct pt_filter - IP range filter configuration + * @msr_a: range start, goes to RTIT_ADDRn_A + * @msr_b: range end, goes to RTIT_ADDRn_B + * @config: 4-bit field in RTIT_CTL + */ +struct pt_filter { + unsigned long msr_a; + unsigned long msr_b; + unsigned long config; +}; + +/** + * struct pt_filters - IP range filtering context + * @filter: filters defined for this context + * @nr_filters: number of defined filters in the @filter array + */ +struct pt_filters { + struct pt_filter filter[PT_FILTERS_NUM]; + unsigned int nr_filters; +}; + /** * struct pt - per-cpu pt context * @handle: perf output handle + * @filters: last configured filters * @handle_nmi: do handle PT PMI on this cpu, there's an active event * @vmx_on: 1 if VMX is ON on this cpu */ struct pt { struct perf_output_handle handle; + struct pt_filters filters; int handle_nmi; int vmx_on; }; diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 1705c9d75e44..99c4bab123cd 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -27,10 +27,14 
@@ * event: rapl_energy_dram * perf code: 0x3 * - * dram counter: consumption of the builtin-gpu domain (client only) + * gpu counter: consumption of the builtin-gpu domain (client only) * event: rapl_energy_gpu * perf code: 0x4 * + * psys counter: consumption of the builtin-psys domain (client only) + * event: rapl_energy_psys + * perf code: 0x5 + * * We manage those counters as free running (read-only). They may be * use simultaneously by other tools, such as turbostat. * @@ -53,6 +57,8 @@ #include <asm/cpu_device_id.h> #include "../perf_event.h" +MODULE_LICENSE("GPL"); + /* * RAPL energy status counters */ @@ -64,13 +70,16 @@ #define INTEL_RAPL_RAM 0x3 /* pseudo-encoding */ #define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */ #define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */ +#define RAPL_IDX_PSYS_NRG_STAT 4 /* psys */ +#define INTEL_RAPL_PSYS 0x5 /* pseudo-encoding */ -#define NR_RAPL_DOMAINS 0x4 +#define NR_RAPL_DOMAINS 0x5 static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { "pp0-core", "package", "dram", "pp1-gpu", + "psys", }; /* Clients have PP0, PKG */ @@ -89,6 +98,13 @@ static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { 1<<RAPL_IDX_RAM_NRG_STAT|\ 1<<RAPL_IDX_PP1_NRG_STAT) +/* SKL clients have PP0, PKG, RAM, PP1, PSYS */ +#define RAPL_IDX_SKL_CLN (1<<RAPL_IDX_PP0_NRG_STAT|\ + 1<<RAPL_IDX_PKG_NRG_STAT|\ + 1<<RAPL_IDX_RAM_NRG_STAT|\ + 1<<RAPL_IDX_PP1_NRG_STAT|\ + 1<<RAPL_IDX_PSYS_NRG_STAT) + /* Knights Landing has PKG, RAM */ #define RAPL_IDX_KNL (1<<RAPL_IDX_PKG_NRG_STAT|\ 1<<RAPL_IDX_RAM_NRG_STAT) @@ -360,6 +376,10 @@ static int rapl_pmu_event_init(struct perf_event *event) bit = RAPL_IDX_PP1_NRG_STAT; msr = MSR_PP1_ENERGY_STATUS; break; + case INTEL_RAPL_PSYS: + bit = RAPL_IDX_PSYS_NRG_STAT; + msr = MSR_PLATFORM_ENERGY_STATUS; + break; default: return -EINVAL; } @@ -414,11 +434,13 @@ RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01"); RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02"); RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03"); RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04"); +RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05"); RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules"); RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules"); RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules"); RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules"); +RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_psys_unit, "Joules"); /* * we compute in 0.23 nJ increments regardless of MSR @@ -427,6 +449,7 @@ RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890 RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10"); RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10"); RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10"); +RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_psys_scale, "2.3283064365386962890625e-10"); static struct attribute *rapl_events_srv_attr[] = { EVENT_PTR(rapl_cores), @@ -476,6 +499,27 @@ static struct attribute *rapl_events_hsw_attr[] = { NULL, }; +static struct attribute *rapl_events_skl_attr[] = { + EVENT_PTR(rapl_cores), + EVENT_PTR(rapl_pkg), + EVENT_PTR(rapl_gpu), + EVENT_PTR(rapl_ram), + EVENT_PTR(rapl_psys), + + EVENT_PTR(rapl_cores_unit), + EVENT_PTR(rapl_pkg_unit), + EVENT_PTR(rapl_gpu_unit), + EVENT_PTR(rapl_ram_unit), + EVENT_PTR(rapl_psys_unit), + + EVENT_PTR(rapl_cores_scale), + EVENT_PTR(rapl_pkg_scale), + EVENT_PTR(rapl_gpu_scale), 
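The .scale attributes above advertise 2.3283064365386962890625e-10 Joules per count for every domain, i.e. 2^-32 J, matching the "0.23 nJ increments" note. A one-line sketch of the conversion a reader of these counters would apply; raw_count is an assumed variable holding the accumulated event value:

	/* Energy in Joules from an accumulated RAPL count, using the advertised scale. */
	double joules = (double)raw_count * 2.3283064365386962890625e-10;	/* = 2^-32 J */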
+ EVENT_PTR(rapl_ram_scale), + EVENT_PTR(rapl_psys_scale), + NULL, +}; + static struct attribute *rapl_events_knl_attr[] = { EVENT_PTR(rapl_pkg), EVENT_PTR(rapl_ram), @@ -592,6 +636,11 @@ static int rapl_cpu_notifier(struct notifier_block *self, return NOTIFY_OK; } +static struct notifier_block rapl_cpu_nb = { + .notifier_call = rapl_cpu_notifier, + .priority = CPU_PRI_PERF + 1, +}; + static int rapl_check_hw_unit(bool apply_quirk) { u64 msr_rapl_power_unit_bits; @@ -660,7 +709,7 @@ static int __init rapl_prepare_cpus(void) return 0; } -static void __init cleanup_rapl_pmus(void) +static void cleanup_rapl_pmus(void) { int i; @@ -691,52 +740,92 @@ static int __init init_rapl_pmus(void) return 0; } +#define X86_RAPL_MODEL_MATCH(model, init) \ + { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&init } + +struct intel_rapl_init_fun { + bool apply_quirk; + int cntr_mask; + struct attribute **attrs; +}; + +static const struct intel_rapl_init_fun snb_rapl_init __initconst = { + .apply_quirk = false, + .cntr_mask = RAPL_IDX_CLN, + .attrs = rapl_events_cln_attr, +}; + +static const struct intel_rapl_init_fun hsx_rapl_init __initconst = { + .apply_quirk = true, + .cntr_mask = RAPL_IDX_SRV, + .attrs = rapl_events_srv_attr, +}; + +static const struct intel_rapl_init_fun hsw_rapl_init __initconst = { + .apply_quirk = false, + .cntr_mask = RAPL_IDX_HSW, + .attrs = rapl_events_hsw_attr, +}; + +static const struct intel_rapl_init_fun snbep_rapl_init __initconst = { + .apply_quirk = false, + .cntr_mask = RAPL_IDX_SRV, + .attrs = rapl_events_srv_attr, +}; + +static const struct intel_rapl_init_fun knl_rapl_init __initconst = { + .apply_quirk = true, + .cntr_mask = RAPL_IDX_KNL, + .attrs = rapl_events_knl_attr, +}; + +static const struct intel_rapl_init_fun skl_rapl_init __initconst = { + .apply_quirk = false, + .cntr_mask = RAPL_IDX_SKL_CLN, + .attrs = rapl_events_skl_attr, +}; + static const struct x86_cpu_id rapl_cpu_match[] __initconst = { - [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 }, - [1] = {}, + X86_RAPL_MODEL_MATCH(42, snb_rapl_init), /* Sandy Bridge */ + X86_RAPL_MODEL_MATCH(45, snbep_rapl_init), /* Sandy Bridge-EP */ + + X86_RAPL_MODEL_MATCH(58, snb_rapl_init), /* Ivy Bridge */ + X86_RAPL_MODEL_MATCH(62, snbep_rapl_init), /* IvyTown */ + + X86_RAPL_MODEL_MATCH(60, hsw_rapl_init), /* Haswell */ + X86_RAPL_MODEL_MATCH(63, hsx_rapl_init), /* Haswell-Server */ + X86_RAPL_MODEL_MATCH(69, hsw_rapl_init), /* Haswell-Celeron */ + X86_RAPL_MODEL_MATCH(70, hsw_rapl_init), /* Haswell GT3e */ + + X86_RAPL_MODEL_MATCH(61, hsw_rapl_init), /* Broadwell */ + X86_RAPL_MODEL_MATCH(71, hsw_rapl_init), /* Broadwell-H */ + X86_RAPL_MODEL_MATCH(79, hsx_rapl_init), /* Broadwell-Server */ + X86_RAPL_MODEL_MATCH(86, hsx_rapl_init), /* Broadwell Xeon D */ + + X86_RAPL_MODEL_MATCH(87, knl_rapl_init), /* Knights Landing */ + + X86_RAPL_MODEL_MATCH(78, skl_rapl_init), /* Skylake */ + X86_RAPL_MODEL_MATCH(94, skl_rapl_init), /* Skylake H/S */ + {}, }; +MODULE_DEVICE_TABLE(x86cpu, rapl_cpu_match); + static int __init rapl_pmu_init(void) { - bool apply_quirk = false; + const struct x86_cpu_id *id; + struct intel_rapl_init_fun *rapl_init; + bool apply_quirk; int ret; - if (!x86_match_cpu(rapl_cpu_match)) + id = x86_match_cpu(rapl_cpu_match); + if (!id) return -ENODEV; - switch (boot_cpu_data.x86_model) { - case 42: /* Sandy Bridge */ - case 58: /* Ivy Bridge */ - rapl_cntr_mask = RAPL_IDX_CLN; - rapl_pmu_events_group.attrs = rapl_events_cln_attr; - break; - case 63: /* Haswell-Server */ - case 79: /* 
Broadwell-Server */ - apply_quirk = true; - rapl_cntr_mask = RAPL_IDX_SRV; - rapl_pmu_events_group.attrs = rapl_events_srv_attr; - break; - case 60: /* Haswell */ - case 69: /* Haswell-Celeron */ - case 70: /* Haswell GT3e */ - case 61: /* Broadwell */ - case 71: /* Broadwell-H */ - rapl_cntr_mask = RAPL_IDX_HSW; - rapl_pmu_events_group.attrs = rapl_events_hsw_attr; - break; - case 45: /* Sandy Bridge-EP */ - case 62: /* IvyTown */ - rapl_cntr_mask = RAPL_IDX_SRV; - rapl_pmu_events_group.attrs = rapl_events_srv_attr; - break; - case 87: /* Knights Landing */ - apply_quirk = true; - rapl_cntr_mask = RAPL_IDX_KNL; - rapl_pmu_events_group.attrs = rapl_events_knl_attr; - break; - default: - return -ENODEV; - } + rapl_init = (struct intel_rapl_init_fun *)id->driver_data; + apply_quirk = rapl_init->apply_quirk; + rapl_cntr_mask = rapl_init->cntr_mask; + rapl_pmu_events_group.attrs = rapl_init->attrs; ret = rapl_check_hw_unit(apply_quirk); if (ret) @@ -756,7 +845,7 @@ static int __init rapl_pmu_init(void) if (ret) goto out; - __perf_cpu_notifier(rapl_cpu_notifier); + __register_cpu_notifier(&rapl_cpu_nb); cpu_notifier_register_done(); rapl_advertise(); return 0; @@ -767,4 +856,14 @@ out: cpu_notifier_register_done(); return ret; } -device_initcall(rapl_pmu_init); +module_init(rapl_pmu_init); + +static void __exit intel_rapl_exit(void) +{ + cpu_notifier_register_begin(); + __unregister_cpu_notifier(&rapl_cpu_nb); + perf_pmu_unregister(&rapl_pmus->pmu); + cleanup_rapl_pmus(); + cpu_notifier_register_done(); +} +module_exit(intel_rapl_exit); diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 7012d18bb293..16c178916412 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1,3 +1,4 @@ +#include <asm/cpu_device_id.h> #include "uncore.h" static struct intel_uncore_type *empty_uncore[] = { NULL, }; @@ -21,6 +22,8 @@ static struct event_constraint uncore_constraint_fixed = struct event_constraint uncore_constraint_empty = EVENT_CONSTRAINT(0, 0, 0); +MODULE_LICENSE("GPL"); + static int uncore_pcibus_to_physid(struct pci_bus *bus) { struct pci2phy_map *map; @@ -754,7 +757,7 @@ static void uncore_pmu_unregister(struct intel_uncore_pmu *pmu) pmu->registered = false; } -static void __init __uncore_exit_boxes(struct intel_uncore_type *type, int cpu) +static void __uncore_exit_boxes(struct intel_uncore_type *type, int cpu) { struct intel_uncore_pmu *pmu = type->pmus; struct intel_uncore_box *box; @@ -770,7 +773,7 @@ static void __init __uncore_exit_boxes(struct intel_uncore_type *type, int cpu) } } -static void __init uncore_exit_boxes(void *dummy) +static void uncore_exit_boxes(void *dummy) { struct intel_uncore_type **types; @@ -787,7 +790,7 @@ static void uncore_free_boxes(struct intel_uncore_pmu *pmu) kfree(pmu->boxes); } -static void __init uncore_type_exit(struct intel_uncore_type *type) +static void uncore_type_exit(struct intel_uncore_type *type) { struct intel_uncore_pmu *pmu = type->pmus; int i; @@ -804,7 +807,7 @@ static void __init uncore_type_exit(struct intel_uncore_type *type) type->events_group = NULL; } -static void __init uncore_types_exit(struct intel_uncore_type **types) +static void uncore_types_exit(struct intel_uncore_type **types) { for (; *types; types++) uncore_type_exit(*types); @@ -989,46 +992,6 @@ static int __init uncore_pci_init(void) size_t size; int ret; - switch (boot_cpu_data.x86_model) { - case 45: /* Sandy Bridge-EP */ - ret = snbep_uncore_pci_init(); - break; - case 62: /* Ivy Bridge-EP */ - ret = 
ivbep_uncore_pci_init(); - break; - case 63: /* Haswell-EP */ - ret = hswep_uncore_pci_init(); - break; - case 79: /* BDX-EP */ - case 86: /* BDX-DE */ - ret = bdx_uncore_pci_init(); - break; - case 42: /* Sandy Bridge */ - ret = snb_uncore_pci_init(); - break; - case 58: /* Ivy Bridge */ - ret = ivb_uncore_pci_init(); - break; - case 60: /* Haswell */ - case 69: /* Haswell Celeron */ - ret = hsw_uncore_pci_init(); - break; - case 61: /* Broadwell */ - ret = bdw_uncore_pci_init(); - break; - case 87: /* Knights Landing */ - ret = knl_uncore_pci_init(); - break; - case 94: /* SkyLake */ - ret = skl_uncore_pci_init(); - break; - default: - return -ENODEV; - } - - if (ret) - return ret; - size = max_packages * sizeof(struct pci_extra_dev); uncore_extra_pci_dev = kzalloc(size, GFP_KERNEL); if (!uncore_extra_pci_dev) { @@ -1060,7 +1023,7 @@ err: return ret; } -static void __init uncore_pci_exit(void) +static void uncore_pci_exit(void) { if (pcidrv_registered) { pcidrv_registered = false; @@ -1287,46 +1250,6 @@ static int __init uncore_cpu_init(void) { int ret; - switch (boot_cpu_data.x86_model) { - case 26: /* Nehalem */ - case 30: - case 37: /* Westmere */ - case 44: - nhm_uncore_cpu_init(); - break; - case 42: /* Sandy Bridge */ - case 58: /* Ivy Bridge */ - case 60: /* Haswell */ - case 69: /* Haswell */ - case 70: /* Haswell */ - case 61: /* Broadwell */ - case 71: /* Broadwell */ - snb_uncore_cpu_init(); - break; - case 45: /* Sandy Bridge-EP */ - snbep_uncore_cpu_init(); - break; - case 46: /* Nehalem-EX */ - case 47: /* Westmere-EX aka. Xeon E7 */ - nhmex_uncore_cpu_init(); - break; - case 62: /* Ivy Bridge-EP */ - ivbep_uncore_cpu_init(); - break; - case 63: /* Haswell-EP */ - hswep_uncore_cpu_init(); - break; - case 79: /* BDX-EP */ - case 86: /* BDX-DE */ - bdx_uncore_cpu_init(); - break; - case 87: /* Knights Landing */ - knl_uncore_cpu_init(); - break; - default: - return -ENODEV; - } - ret = uncore_types_init(uncore_msr_uncores, true); if (ret) goto err; @@ -1376,20 +1299,123 @@ static int __init uncore_cpumask_init(bool msr) return 0; } +#define X86_UNCORE_MODEL_MATCH(model, init) \ + { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&init } + +struct intel_uncore_init_fun { + void (*cpu_init)(void); + int (*pci_init)(void); +}; + +static const struct intel_uncore_init_fun nhm_uncore_init __initconst = { + .cpu_init = nhm_uncore_cpu_init, +}; + +static const struct intel_uncore_init_fun snb_uncore_init __initconst = { + .cpu_init = snb_uncore_cpu_init, + .pci_init = snb_uncore_pci_init, +}; + +static const struct intel_uncore_init_fun ivb_uncore_init __initconst = { + .cpu_init = snb_uncore_cpu_init, + .pci_init = ivb_uncore_pci_init, +}; + +static const struct intel_uncore_init_fun hsw_uncore_init __initconst = { + .cpu_init = snb_uncore_cpu_init, + .pci_init = hsw_uncore_pci_init, +}; + +static const struct intel_uncore_init_fun bdw_uncore_init __initconst = { + .cpu_init = snb_uncore_cpu_init, + .pci_init = bdw_uncore_pci_init, +}; + +static const struct intel_uncore_init_fun snbep_uncore_init __initconst = { + .cpu_init = snbep_uncore_cpu_init, + .pci_init = snbep_uncore_pci_init, +}; + +static const struct intel_uncore_init_fun nhmex_uncore_init __initconst = { + .cpu_init = nhmex_uncore_cpu_init, +}; + +static const struct intel_uncore_init_fun ivbep_uncore_init __initconst = { + .cpu_init = ivbep_uncore_cpu_init, + .pci_init = ivbep_uncore_pci_init, +}; + +static const struct intel_uncore_init_fun hswep_uncore_init __initconst = { + .cpu_init = 
hswep_uncore_cpu_init, + .pci_init = hswep_uncore_pci_init, +}; + +static const struct intel_uncore_init_fun bdx_uncore_init __initconst = { + .cpu_init = bdx_uncore_cpu_init, + .pci_init = bdx_uncore_pci_init, +}; + +static const struct intel_uncore_init_fun knl_uncore_init __initconst = { + .cpu_init = knl_uncore_cpu_init, + .pci_init = knl_uncore_pci_init, +}; + +static const struct intel_uncore_init_fun skl_uncore_init __initconst = { + .pci_init = skl_uncore_pci_init, +}; + +static const struct x86_cpu_id intel_uncore_match[] __initconst = { + X86_UNCORE_MODEL_MATCH(26, nhm_uncore_init), /* Nehalem */ + X86_UNCORE_MODEL_MATCH(30, nhm_uncore_init), + X86_UNCORE_MODEL_MATCH(37, nhm_uncore_init), /* Westmere */ + X86_UNCORE_MODEL_MATCH(44, nhm_uncore_init), + X86_UNCORE_MODEL_MATCH(42, snb_uncore_init), /* Sandy Bridge */ + X86_UNCORE_MODEL_MATCH(58, ivb_uncore_init), /* Ivy Bridge */ + X86_UNCORE_MODEL_MATCH(60, hsw_uncore_init), /* Haswell */ + X86_UNCORE_MODEL_MATCH(69, hsw_uncore_init), /* Haswell Celeron */ + X86_UNCORE_MODEL_MATCH(70, hsw_uncore_init), /* Haswell */ + X86_UNCORE_MODEL_MATCH(61, bdw_uncore_init), /* Broadwell */ + X86_UNCORE_MODEL_MATCH(71, bdw_uncore_init), /* Broadwell */ + X86_UNCORE_MODEL_MATCH(45, snbep_uncore_init), /* Sandy Bridge-EP */ + X86_UNCORE_MODEL_MATCH(46, nhmex_uncore_init), /* Nehalem-EX */ + X86_UNCORE_MODEL_MATCH(47, nhmex_uncore_init), /* Westmere-EX aka. Xeon E7 */ + X86_UNCORE_MODEL_MATCH(62, ivbep_uncore_init), /* Ivy Bridge-EP */ + X86_UNCORE_MODEL_MATCH(63, hswep_uncore_init), /* Haswell-EP */ + X86_UNCORE_MODEL_MATCH(79, bdx_uncore_init), /* BDX-EP */ + X86_UNCORE_MODEL_MATCH(86, bdx_uncore_init), /* BDX-DE */ + X86_UNCORE_MODEL_MATCH(87, knl_uncore_init), /* Knights Landing */ + X86_UNCORE_MODEL_MATCH(94, skl_uncore_init), /* SkyLake */ + {}, +}; + +MODULE_DEVICE_TABLE(x86cpu, intel_uncore_match); + static int __init intel_uncore_init(void) { - int pret, cret, ret; + const struct x86_cpu_id *id; + struct intel_uncore_init_fun *uncore_init; + int pret = 0, cret = 0, ret; - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + id = x86_match_cpu(intel_uncore_match); + if (!id) return -ENODEV; - if (cpu_has_hypervisor) + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) return -ENODEV; max_packages = topology_max_packages(); - pret = uncore_pci_init(); - cret = uncore_cpu_init(); + uncore_init = (struct intel_uncore_init_fun *)id->driver_data; + if (uncore_init->pci_init) { + pret = uncore_init->pci_init(); + if (!pret) + pret = uncore_pci_init(); + } + + if (uncore_init->cpu_init) { + uncore_init->cpu_init(); + cret = uncore_cpu_init(); + } if (cret && pret) return -ENODEV; @@ -1409,4 +1435,14 @@ err: cpu_notifier_register_done(); return ret; } -device_initcall(intel_uncore_init); +module_init(intel_uncore_init); + +static void __exit intel_uncore_exit(void) +{ + cpu_notifier_register_begin(); + __unregister_cpu_notifier(&uncore_cpu_nb); + uncore_types_exit(uncore_msr_uncores); + uncore_pci_exit(); + cpu_notifier_register_done(); +} +module_exit(intel_uncore_exit); diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index ab2bcaaebe38..b2625867ebd1 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -219,6 +219,9 @@ #define KNL_CHA_MSR_PMON_BOX_FILTER_TID 0x1ff #define KNL_CHA_MSR_PMON_BOX_FILTER_STATE (7 << 18) #define KNL_CHA_MSR_PMON_BOX_FILTER_OP (0xfffffe2aULL << 32) +#define KNL_CHA_MSR_PMON_BOX_FILTER_REMOTE_NODE (0x1ULL << 32) +#define 
KNL_CHA_MSR_PMON_BOX_FILTER_LOCAL_NODE (0x1ULL << 33) +#define KNL_CHA_MSR_PMON_BOX_FILTER_NNC (0x1ULL << 37) /* KNL EDC/MC UCLK */ #define KNL_UCLK_MSR_PMON_CTR0_LOW 0x400 @@ -1902,6 +1905,10 @@ static int knl_cha_hw_config(struct intel_uncore_box *box, reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 + KNL_CHA_MSR_OFFSET * box->pmu->pmu_idx; reg1->config = event->attr.config1 & knl_cha_filter_mask(idx); + + reg1->config |= KNL_CHA_MSR_PMON_BOX_FILTER_REMOTE_NODE; + reg1->config |= KNL_CHA_MSR_PMON_BOX_FILTER_LOCAL_NODE; + reg1->config |= KNL_CHA_MSR_PMON_BOX_FILTER_NNC; reg1->idx = idx; } return 0; diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index ec863b9a9f78..85ef3c2e80e0 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -6,6 +6,8 @@ enum perf_msr_id { PERF_MSR_MPERF = 2, PERF_MSR_PPERF = 3, PERF_MSR_SMI = 4, + PERF_MSR_PTSC = 5, + PERF_MSR_IRPERF = 6, PERF_MSR_EVENT_MAX, }; @@ -15,6 +17,16 @@ static bool test_aperfmperf(int idx) return boot_cpu_has(X86_FEATURE_APERFMPERF); } +static bool test_ptsc(int idx) +{ + return boot_cpu_has(X86_FEATURE_PTSC); +} + +static bool test_irperf(int idx) +{ + return boot_cpu_has(X86_FEATURE_IRPERF); +} + static bool test_intel(int idx) { if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || @@ -69,18 +81,22 @@ struct perf_msr { bool (*test)(int idx); }; -PMU_EVENT_ATTR_STRING(tsc, evattr_tsc, "event=0x00"); -PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01"); -PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02"); -PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03"); -PMU_EVENT_ATTR_STRING(smi, evattr_smi, "event=0x04"); +PMU_EVENT_ATTR_STRING(tsc, evattr_tsc, "event=0x00"); +PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01"); +PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02"); +PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03"); +PMU_EVENT_ATTR_STRING(smi, evattr_smi, "event=0x04"); +PMU_EVENT_ATTR_STRING(ptsc, evattr_ptsc, "event=0x05"); +PMU_EVENT_ATTR_STRING(irperf, evattr_irperf, "event=0x06"); static struct perf_msr msr[] = { - [PERF_MSR_TSC] = { 0, &evattr_tsc, NULL, }, - [PERF_MSR_APERF] = { MSR_IA32_APERF, &evattr_aperf, test_aperfmperf, }, - [PERF_MSR_MPERF] = { MSR_IA32_MPERF, &evattr_mperf, test_aperfmperf, }, - [PERF_MSR_PPERF] = { MSR_PPERF, &evattr_pperf, test_intel, }, - [PERF_MSR_SMI] = { MSR_SMI_COUNT, &evattr_smi, test_intel, }, + [PERF_MSR_TSC] = { 0, &evattr_tsc, NULL, }, + [PERF_MSR_APERF] = { MSR_IA32_APERF, &evattr_aperf, test_aperfmperf, }, + [PERF_MSR_MPERF] = { MSR_IA32_MPERF, &evattr_mperf, test_aperfmperf, }, + [PERF_MSR_PPERF] = { MSR_PPERF, &evattr_pperf, test_intel, }, + [PERF_MSR_SMI] = { MSR_SMI_COUNT, &evattr_smi, test_intel, }, + [PERF_MSR_PTSC] = { MSR_F15H_PTSC, &evattr_ptsc, test_ptsc, }, + [PERF_MSR_IRPERF] = { MSR_F17H_IRPERF, &evattr_irperf, test_irperf, }, }; static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = { @@ -166,7 +182,7 @@ again: if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) delta = sign_extend64(delta, 31); - local64_add(now - prev, &event->count); + local64_add(delta, &event->count); } static void msr_event_start(struct perf_event *event, int flags) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index ad4dc7ffffb5..8bd764df815d 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -601,6 +601,7 @@ struct x86_pmu { u64 lbr_sel_mask; /* LBR_SELECT valid bits */ const int *lbr_sel_map; /* lbr_select mappings */ bool lbr_double_abort; /* duplicated lbr aborts */ + bool 
lbr_pt_coexist; /* LBR may coexist with PT */ /* * Intel PT/LBR/BTS are exclusive @@ -859,6 +860,8 @@ extern struct event_constraint intel_atom_pebs_event_constraints[]; extern struct event_constraint intel_slm_pebs_event_constraints[]; +extern struct event_constraint intel_glm_pebs_event_constraints[]; + extern struct event_constraint intel_nehalem_pebs_event_constraints[]; extern struct event_constraint intel_westmere_pebs_event_constraints[]; @@ -907,6 +910,8 @@ void intel_pmu_lbr_init_nhm(void); void intel_pmu_lbr_init_atom(void); +void intel_pmu_lbr_init_slm(void); + void intel_pmu_lbr_init_snb(void); void intel_pmu_lbr_init_hsw(void); diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 0552884da18d..2f29f4e407c3 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -357,7 +357,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, put_user_ex(ptr_to_compat(&frame->uc), &frame->puc); /* Create the ucontext. */ - if (cpu_has_xsave) + if (boot_cpu_has(X86_FEATURE_XSAVE)) put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); else put_user_ex(0, &frame->uc.uc_flags); diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 99afb665a004..e77a6443104f 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -1,11 +1,12 @@ #ifndef _ASM_X86_ALTERNATIVE_H #define _ASM_X86_ALTERNATIVE_H +#ifndef __ASSEMBLY__ + #include <linux/types.h> #include <linux/stddef.h> #include <linux/stringify.h> #include <asm/asm.h> -#include <asm/ptrace.h> /* * Alternative inline assembly for SMP. @@ -233,36 +234,6 @@ static inline int alternatives_text_reserved(void *start, void *end) */ #define ASM_NO_INPUT_CLOBBER(clbr...) "i" (0) : clbr -struct paravirt_patch_site; -#ifdef CONFIG_PARAVIRT -void apply_paravirt(struct paravirt_patch_site *start, - struct paravirt_patch_site *end); -#else -static inline void apply_paravirt(struct paravirt_patch_site *start, - struct paravirt_patch_site *end) -{} -#define __parainstructions NULL -#define __parainstructions_end NULL -#endif - -extern void *text_poke_early(void *addr, const void *opcode, size_t len); - -/* - * Clear and restore the kernel write-protection flag on the local CPU. - * Allows the kernel to edit read-only pages. - * Side-effect: any interrupt handler running between save and restore will have - * the ability to write to read-only pages. - * - * Warning: - * Code patching in the UP case is safe if NMIs and MCE handlers are stopped and - * no thread can be preempted in the instructions being modified (no iret to an - * invalid instruction possible) or if the instructions are changed from a - * consistent state to another consistent state atomically. - * On the local CPU you need to be protected again NMI or MCE handlers seeing an - * inconsistent instruction while you patch. 
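The ia32_signal.c hunk above shows a pattern repeated throughout this series: the cpu_has_<feature> convenience macros are replaced by explicit boot_cpu_has(X86_FEATURE_<FEATURE>) tests, and the macros themselves are deleted from cpufeature.h further down. A minimal sketch of the pattern; the wrapper function is hypothetical:

static bool example_has_xsave(void)
{
	/* was: return cpu_has_xsave; */
	return boot_cpu_has(X86_FEATURE_XSAVE);
}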
- */ -extern void *text_poke(void *addr, const void *opcode, size_t len); -extern int poke_int3_handler(struct pt_regs *regs); -extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); +#endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_ALTERNATIVE_H */ diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 98f25bbafac4..bc27611fa58f 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -239,10 +239,10 @@ extern void __init check_x2apic(void); extern void x2apic_setup(void); static inline int x2apic_enabled(void) { - return cpu_has_x2apic && apic_is_x2apic_enabled(); + return boot_cpu_has(X86_FEATURE_X2APIC) && apic_is_x2apic_enabled(); } -#define x2apic_supported() (cpu_has_x2apic) +#define x2apic_supported() (boot_cpu_has(X86_FEATURE_X2APIC)) #else /* !CONFIG_X86_X2APIC */ static inline void check_x2apic(void) { } static inline void x2apic_setup(void) { } diff --git a/arch/x86/include/asm/bios_ebda.h b/arch/x86/include/asm/bios_ebda.h index aa6a3170ab5a..2b00c776f223 100644 --- a/arch/x86/include/asm/bios_ebda.h +++ b/arch/x86/include/asm/bios_ebda.h @@ -17,27 +17,6 @@ static inline unsigned int get_bios_ebda(void) return address; /* 0 means none */ } -/* - * Return the sanitized length of the EBDA in bytes, if it exists. - */ -static inline unsigned int get_bios_ebda_length(void) -{ - unsigned int address; - unsigned int length; - - address = get_bios_ebda(); - if (!address) - return 0; - - /* EBDA length is byte 0 of the EBDA (stored in KiB) */ - length = *(unsigned char *)phys_to_virt(address); - length <<= 10; - - /* Trim the length if it extends beyond 640KiB */ - length = min_t(unsigned int, (640 * 1024) - address, length); - return length; -} - void reserve_ebda_region(void); #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 6b8d6e8cd449..abd06b19ddd2 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -12,29 +12,46 @@ /* Minimum kernel alignment, as a power of two */ #ifdef CONFIG_X86_64 -#define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT +# define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT #else -#define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT + THREAD_SIZE_ORDER) +# define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT + THREAD_SIZE_ORDER) #endif #define MIN_KERNEL_ALIGN (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2) #if (CONFIG_PHYSICAL_ALIGN & (CONFIG_PHYSICAL_ALIGN-1)) || \ (CONFIG_PHYSICAL_ALIGN < MIN_KERNEL_ALIGN) -#error "Invalid value for CONFIG_PHYSICAL_ALIGN" +# error "Invalid value for CONFIG_PHYSICAL_ALIGN" #endif #ifdef CONFIG_KERNEL_BZIP2 -#define BOOT_HEAP_SIZE 0x400000 +# define BOOT_HEAP_SIZE 0x400000 #else /* !CONFIG_KERNEL_BZIP2 */ - -#define BOOT_HEAP_SIZE 0x10000 - -#endif /* !CONFIG_KERNEL_BZIP2 */ +# define BOOT_HEAP_SIZE 0x10000 +#endif #ifdef CONFIG_X86_64 -#define BOOT_STACK_SIZE 0x4000 -#else -#define BOOT_STACK_SIZE 0x1000 +# define BOOT_STACK_SIZE 0x4000 + +# define BOOT_INIT_PGT_SIZE (6*4096) +# ifdef CONFIG_RANDOMIZE_BASE +/* + * Assuming all cross the 512GB boundary: + * 1 page for level4 + * (2+2)*4 pages for kernel, param, cmd_line, and randomized kernel + * 2 pages for first 2M (video RAM: CONFIG_X86_VERBOSE_BOOTUP). + * Total is 19 pages. 
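Spelling out the page budget from the boot.h comment above (a worked check, not part of the patch):

	1		/* level-4 page					*/
	+ 4 * (2 + 2)	/* kernel, param, cmd_line, randomized kernel	*/
	+ 2		/* first 2M of video RAM			*/
	= 19 pages	/* hence 19*4096, or 17*4096 without the two video-RAM pages */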
+ */ +# ifdef CONFIG_X86_VERBOSE_BOOTUP +# define BOOT_PGT_SIZE (19*4096) +# else /* !CONFIG_X86_VERBOSE_BOOTUP */ +# define BOOT_PGT_SIZE (17*4096) +# endif +# else /* !CONFIG_RANDOMIZE_BASE */ +# define BOOT_PGT_SIZE BOOT_INIT_PGT_SIZE +# endif + +#else /* !CONFIG_X86_64 */ +# define BOOT_STACK_SIZE 0x1000 #endif #endif /* _ASM_X86_BOOT_H */ diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h index d194266acb28..eae33c7170c8 100644 --- a/arch/x86/include/asm/clocksource.h +++ b/arch/x86/include/asm/clocksource.h @@ -3,11 +3,10 @@ #ifndef _ASM_X86_CLOCKSOURCE_H #define _ASM_X86_CLOCKSOURCE_H -#define VCLOCK_NONE 0 /* No vDSO clock available. */ -#define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */ -#define VCLOCK_HPET 2 /* vDSO should use vread_hpet. */ -#define VCLOCK_PVCLOCK 3 /* vDSO should use vread_pvclock. */ -#define VCLOCK_MAX 3 +#define VCLOCK_NONE 0 /* No vDSO clock available. */ +#define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */ +#define VCLOCK_PVCLOCK 2 /* vDSO should use vread_pvclock. */ +#define VCLOCK_MAX 2 struct arch_clocksource_data { int vclock_mode; diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index ebb102e1bbc7..5a3b2c119ed0 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -307,7 +307,7 @@ static inline void __user *arch_compat_alloc_user_space(long len) return (void __user *)round_down(sp - len, 16); } -static inline bool is_x32_task(void) +static inline bool in_x32_syscall(void) { #ifdef CONFIG_X86_X32_ABI if (task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT) @@ -318,7 +318,7 @@ static inline bool is_x32_task(void) static inline bool in_compat_syscall(void) { - return is_ia32_task() || is_x32_task(); + return in_ia32_syscall() || in_x32_syscall(); } #define in_compat_syscall in_compat_syscall /* override the generic impl */ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index aeab47932933..483fb547e3c0 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -27,6 +27,7 @@ enum cpuid_leafs CPUID_6_EAX, CPUID_8000_000A_EDX, CPUID_7_ECX, + CPUID_8000_0007_EBX, }; #ifdef CONFIG_X86_FEATURE_NAMES @@ -118,31 +119,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; set_bit(bit, (unsigned long *)cpu_caps_set); \ } while (0) -#define cpu_has_fpu boot_cpu_has(X86_FEATURE_FPU) -#define cpu_has_pse boot_cpu_has(X86_FEATURE_PSE) -#define cpu_has_tsc boot_cpu_has(X86_FEATURE_TSC) -#define cpu_has_pge boot_cpu_has(X86_FEATURE_PGE) -#define cpu_has_apic boot_cpu_has(X86_FEATURE_APIC) -#define cpu_has_fxsr boot_cpu_has(X86_FEATURE_FXSR) -#define cpu_has_xmm boot_cpu_has(X86_FEATURE_XMM) -#define cpu_has_xmm2 boot_cpu_has(X86_FEATURE_XMM2) -#define cpu_has_aes boot_cpu_has(X86_FEATURE_AES) -#define cpu_has_avx boot_cpu_has(X86_FEATURE_AVX) -#define cpu_has_avx2 boot_cpu_has(X86_FEATURE_AVX2) -#define cpu_has_clflush boot_cpu_has(X86_FEATURE_CLFLUSH) -#define cpu_has_gbpages boot_cpu_has(X86_FEATURE_GBPAGES) -#define cpu_has_arch_perfmon boot_cpu_has(X86_FEATURE_ARCH_PERFMON) -#define cpu_has_pat boot_cpu_has(X86_FEATURE_PAT) -#define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) -#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) -#define cpu_has_xsaves boot_cpu_has(X86_FEATURE_XSAVES) -#define cpu_has_osxsave boot_cpu_has(X86_FEATURE_OSXSAVE) -#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) -/* - * Do not add any more of those clumsy macros - use static_cpu_has() for - * fast paths 
and boot_cpu_has() otherwise! - */ - #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS) /* * Static testing of CPU features. Used the same as boot_cpu_has(). diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 8f9afefd2dc5..4a413485f9eb 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -12,7 +12,7 @@ /* * Defines x86 CPU feature bits */ -#define NCAPINTS 17 /* N 32-bit words worth of info */ +#define NCAPINTS 18 /* N 32-bit words worth of info */ #define NBUGINTS 1 /* N 32-bit bug flags */ /* @@ -177,6 +177,7 @@ #define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */ #define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ #define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ +#define X86_FEATURE_PTSC ( 6*32+27) /* performance time-stamp counter */ #define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */ #define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ @@ -250,6 +251,7 @@ /* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ #define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ +#define X86_FEATURE_IRPERF (13*32+1) /* Instructions Retired Count */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */ #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ @@ -280,6 +282,11 @@ #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ +/* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */ +#define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */ +#define X86_FEATURE_SUCCOR (17*32+1) /* Uncorrectable error containment and recovery */ +#define X86_FEATURE_SMCA (17*32+3) /* Scalable MCA */ + /* * BUG word(s) */ @@ -294,6 +301,9 @@ #define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ #define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ #define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ +#define X86_BUG_NULL_SEG X86_BUG(9) /* Nulling a selector preserves the base */ +#define X86_BUG_SWAPGS_FENCE X86_BUG(10) /* SWAPGS without input dep on GS */ + #ifdef CONFIG_X86_32 /* diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 53748c45e488..78d1e7467eae 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -3,6 +3,7 @@ #include <asm/fpu/api.h> #include <asm/pgtable.h> +#include <asm/processor-flags.h> #include <asm/tlb.h> /* @@ -28,33 +29,22 @@ #define MAX_CMDLINE_ADDRESS UINT_MAX -#ifdef CONFIG_X86_32 +#define ARCH_EFI_IRQ_FLAGS_MASK X86_EFLAGS_IF +#ifdef CONFIG_X86_32 extern unsigned long asmlinkage efi_call_phys(void *, ...); +#define arch_efi_call_virt_setup() kernel_fpu_begin() +#define arch_efi_call_virt_teardown() kernel_fpu_end() + /* * Wrap all the virtual calls in a way that forces the parameters on the stack. */ - -/* Use this macro if your virtual returns a non-void value */ -#define efi_call_virt(f, args...) \ +#define arch_efi_call_virt(f, args...) \ ({ \ - efi_status_t __s; \ - kernel_fpu_begin(); \ - __s = ((efi_##f##_t __attribute__((regparm(0)))*) \ - efi.systab->runtime->f)(args); \ - kernel_fpu_end(); \ - __s; \ -}) - -/* Use this macro if your virtual call does not return any value */ -#define __efi_call_virt(f, args...) 
\ -({ \ - kernel_fpu_begin(); \ ((efi_##f##_t __attribute__((regparm(0)))*) \ efi.systab->runtime->f)(args); \ - kernel_fpu_end(); \ }) #define efi_ioremap(addr, size, type, attr) ioremap_cache(addr, size) @@ -78,10 +68,8 @@ struct efi_scratch { u64 phys_stack; } __packed; -#define efi_call_virt(f, ...) \ +#define arch_efi_call_virt_setup() \ ({ \ - efi_status_t __s; \ - \ efi_sync_low_kernel_mappings(); \ preempt_disable(); \ __kernel_fpu_begin(); \ @@ -91,9 +79,13 @@ struct efi_scratch { write_cr3((unsigned long)efi_scratch.efi_pgt); \ __flush_tlb_all(); \ } \ - \ - __s = efi_call((void *)efi.systab->runtime->f, __VA_ARGS__); \ - \ +}) + +#define arch_efi_call_virt(f, args...) \ + efi_call((void *)efi.systab->runtime->f, args) \ + +#define arch_efi_call_virt_teardown() \ +({ \ if (efi_scratch.use_pgd) { \ write_cr3(efi_scratch.prev_cr3); \ __flush_tlb_all(); \ @@ -101,15 +93,8 @@ struct efi_scratch { \ __kernel_fpu_end(); \ preempt_enable(); \ - __s; \ }) -/* - * All X86_64 virt calls return non-void values. Thus, use non-void call for - * virt calls that would be void on X86_32. - */ -#define __efi_call_virt(f, args...) efi_call_virt(f, args) - extern void __iomem *__init efi_ioremap(unsigned long addr, unsigned long size, u32 type, u64 attribute); @@ -180,6 +165,8 @@ static inline bool efi_runtime_supported(void) extern struct console early_efi_console; extern void parse_efi_setup(u64 phys_addr, u32 data_len); +extern void efifb_setup_from_dmi(struct screen_info *si, const char *opt); + #ifdef CONFIG_EFI_MIXED extern void efi_thunk_runtime_setup(void); extern efi_status_t efi_thunk_set_virtual_address_map( @@ -225,6 +212,11 @@ __pure const struct efi_config *__efi_early(void); #define efi_call_early(f, ...) \ __efi_early()->call(__efi_early()->f, __VA_ARGS__); +#define __efi_call_early(f, ...) 
\ + __efi_early()->call((unsigned long)f, __VA_ARGS__); + +#define efi_is_64bit() __efi_early()->is64 + extern bool efi_reboot_required(void); #else diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 15340e36ddcb..fea7724141a0 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -176,7 +176,7 @@ static inline void elf_common_init(struct thread_struct *t, regs->si = regs->di = regs->bp = 0; regs->r8 = regs->r9 = regs->r10 = regs->r11 = 0; regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; - t->fs = t->gs = 0; + t->fsbase = t->gsbase = 0; t->fsindex = t->gsindex = 0; t->ds = t->es = ds; } @@ -226,8 +226,8 @@ do { \ (pr_reg)[18] = (regs)->flags; \ (pr_reg)[19] = (regs)->sp; \ (pr_reg)[20] = (regs)->ss; \ - (pr_reg)[21] = current->thread.fs; \ - (pr_reg)[22] = current->thread.gs; \ + (pr_reg)[21] = current->thread.fsbase; \ + (pr_reg)[22] = current->thread.gsbase; \ asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \ asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \ asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \ diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index e6a8613fbfb0..3a106165e03a 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -4,7 +4,7 @@ #include <asm/page.h> #include <asm-generic/hugetlb.h> -#define hugepages_supported() cpu_has_pse +#define hugepages_supported() boot_cpu_has(X86_FEATURE_PSE) static inline int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, diff --git a/arch/x86/include/asm/irq_work.h b/arch/x86/include/asm/irq_work.h index d0afb05c84fc..f70604125286 100644 --- a/arch/x86/include/asm/irq_work.h +++ b/arch/x86/include/asm/irq_work.h @@ -5,7 +5,7 @@ static inline bool arch_irq_work_has_interrupt(void) { - return cpu_has_apic; + return boot_cpu_has(X86_FEATURE_APIC); } #endif /* _ASM_IRQ_WORK_H */ diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h index 332f98c9111f..22a8537eb780 100644 --- a/arch/x86/include/asm/kgdb.h +++ b/arch/x86/include/asm/kgdb.h @@ -6,6 +6,8 @@ * Copyright (C) 2008 Wind River Systems, Inc. */ +#include <asm/ptrace.h> + /* * BUFMAX defines the maximum number of characters in inbound/outbound * buffers at least NUMREGBYTES*2 are needed for register packets diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index 79327e9483a3..0ccb26dda126 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -8,40 +8,6 @@ #ifdef CONFIG_X86_32 #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) - -/* - * Make sure the compiler doesn't do anything stupid with the - * arguments on the stack - they are owned by the *caller*, not - * the callee. This just fools gcc into not spilling into them, - * and keeps it from doing tailcall recursion and/or using the - * stack slots for temporaries, since they are live and "used" - * all the way to the end of the function. - * - * NOTE! On x86-64, all the arguments are in registers, so this - * only matters on a 32-bit kernel. - */ -#define asmlinkage_protect(n, ret, args...) \ - __asmlinkage_protect##n(ret, ##args) -#define __asmlinkage_protect_n(ret, args...) 
\ - __asm__ __volatile__ ("" : "=r" (ret) : "0" (ret), ##args) -#define __asmlinkage_protect0(ret) \ - __asmlinkage_protect_n(ret) -#define __asmlinkage_protect1(ret, arg1) \ - __asmlinkage_protect_n(ret, "m" (arg1)) -#define __asmlinkage_protect2(ret, arg1, arg2) \ - __asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2)) -#define __asmlinkage_protect3(ret, arg1, arg2, arg3) \ - __asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2), "m" (arg3)) -#define __asmlinkage_protect4(ret, arg1, arg2, arg3, arg4) \ - __asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2), "m" (arg3), \ - "m" (arg4)) -#define __asmlinkage_protect5(ret, arg1, arg2, arg3, arg4, arg5) \ - __asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2), "m" (arg3), \ - "m" (arg4), "m" (arg5)) -#define __asmlinkage_protect6(ret, arg1, arg2, arg3, arg4, arg5, arg6) \ - __asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2), "m" (arg3), \ - "m" (arg4), "m" (arg5), "m" (arg6)) - #endif /* CONFIG_X86_32 */ #ifdef __ASSEMBLY__ diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 92b6f651fa4f..8bf766ef0e18 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -104,13 +104,23 @@ #define MCE_LOG_SIGNATURE "MACHINECHECK" /* AMD Scalable MCA */ +#define MSR_AMD64_SMCA_MC0_CTL 0xc0002000 +#define MSR_AMD64_SMCA_MC0_STATUS 0xc0002001 +#define MSR_AMD64_SMCA_MC0_ADDR 0xc0002002 #define MSR_AMD64_SMCA_MC0_MISC0 0xc0002003 #define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004 #define MSR_AMD64_SMCA_MC0_IPID 0xc0002005 +#define MSR_AMD64_SMCA_MC0_DESTAT 0xc0002008 +#define MSR_AMD64_SMCA_MC0_DEADDR 0xc0002009 #define MSR_AMD64_SMCA_MC0_MISC1 0xc000200a +#define MSR_AMD64_SMCA_MCx_CTL(x) (MSR_AMD64_SMCA_MC0_CTL + 0x10*(x)) +#define MSR_AMD64_SMCA_MCx_STATUS(x) (MSR_AMD64_SMCA_MC0_STATUS + 0x10*(x)) +#define MSR_AMD64_SMCA_MCx_ADDR(x) (MSR_AMD64_SMCA_MC0_ADDR + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_MISC(x) (MSR_AMD64_SMCA_MC0_MISC0 + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_IPID(x) (MSR_AMD64_SMCA_MC0_IPID + 0x10*(x)) +#define MSR_AMD64_SMCA_MCx_DESTAT(x) (MSR_AMD64_SMCA_MC0_DESTAT + 0x10*(x)) +#define MSR_AMD64_SMCA_MCx_DEADDR(x) (MSR_AMD64_SMCA_MC0_DEADDR + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x))) /* @@ -168,9 +178,18 @@ struct mce_vendor_flags { __reserved_0 : 61; }; + +struct mca_msr_regs { + u32 (*ctl) (int bank); + u32 (*status) (int bank); + u32 (*addr) (int bank); + u32 (*misc) (int bank); +}; + extern struct mce_vendor_flags mce_flags; extern struct mca_config mca_cfg; +extern struct mca_msr_regs msr_ops; extern void mce_register_decode_chain(struct notifier_block *nb); extern void mce_unregister_decode_chain(struct notifier_block *nb); diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 84280029cafd..396348196aa7 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -115,103 +115,12 @@ static inline void destroy_context(struct mm_struct *mm) destroy_context_ldt(mm); } -static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, - struct task_struct *tsk) -{ - unsigned cpu = smp_processor_id(); +extern void switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk); - if (likely(prev != next)) { -#ifdef CONFIG_SMP - this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); - this_cpu_write(cpu_tlbstate.active_mm, next); -#endif - cpumask_set_cpu(cpu, mm_cpumask(next)); - - /* - * Re-load 
page tables. - * - * This logic has an ordering constraint: - * - * CPU 0: Write to a PTE for 'next' - * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI. - * CPU 1: set bit 1 in next's mm_cpumask - * CPU 1: load from the PTE that CPU 0 writes (implicit) - * - * We need to prevent an outcome in which CPU 1 observes - * the new PTE value and CPU 0 observes bit 1 clear in - * mm_cpumask. (If that occurs, then the IPI will never - * be sent, and CPU 0's TLB will contain a stale entry.) - * - * The bad outcome can occur if either CPU's load is - * reordered before that CPU's store, so both CPUs must - * execute full barriers to prevent this from happening. - * - * Thus, switch_mm needs a full barrier between the - * store to mm_cpumask and any operation that could load - * from next->pgd. TLB fills are special and can happen - * due to instruction fetches or for no reason at all, - * and neither LOCK nor MFENCE orders them. - * Fortunately, load_cr3() is serializing and gives the - * ordering guarantee we need. - * - */ - load_cr3(next->pgd); - - trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); - - /* Stop flush ipis for the previous mm */ - cpumask_clear_cpu(cpu, mm_cpumask(prev)); - - /* Load per-mm CR4 state */ - load_mm_cr4(next); - -#ifdef CONFIG_MODIFY_LDT_SYSCALL - /* - * Load the LDT, if the LDT is different. - * - * It's possible that prev->context.ldt doesn't match - * the LDT register. This can happen if leave_mm(prev) - * was called and then modify_ldt changed - * prev->context.ldt but suppressed an IPI to this CPU. - * In this case, prev->context.ldt != NULL, because we - * never set context.ldt to NULL while the mm still - * exists. That means that next->context.ldt != - * prev->context.ldt, because mms never share an LDT. - */ - if (unlikely(prev->context.ldt != next->context.ldt)) - load_mm_ldt(next); -#endif - } -#ifdef CONFIG_SMP - else { - this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); - BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next); - - if (!cpumask_test_cpu(cpu, mm_cpumask(next))) { - /* - * On established mms, the mm_cpumask is only changed - * from irq context, from ptep_clear_flush() while in - * lazy tlb mode, and here. Irqs are blocked during - * schedule, protecting us from simultaneous changes. - */ - cpumask_set_cpu(cpu, mm_cpumask(next)); - - /* - * We were in lazy tlb mode and leave_mm disabled - * tlb flush IPI delivery. We must reload CR3 - * to make sure to use no freed page tables. - * - * As above, load_cr3() is serializing and orders TLB - * fills with respect to the mm_cpumask write. 
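The inline switch_mm() body removed here, including the mm_cpumask ordering argument in its comments, is replaced by out-of-line declarations just below, with switch_mm_irqs_off() as an entry point that, as the name suggests, expects interrupts to already be disabled. A sketch of the intended calling convention; the caller and its variables are assumed, not part of this patch:

	unsigned long flags;

	local_irq_save(flags);
	switch_mm_irqs_off(prev_mm, next_mm, tsk);	/* interrupts are already off here */
	local_irq_restore(flags);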
- */ - load_cr3(next->pgd); - trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); - load_mm_cr4(next); - load_mm_ldt(next); - } - } -#endif -} +extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk); +#define switch_mm_irqs_off switch_mm_irqs_off #define activate_mm(prev, next) \ do { \ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 5b3c9a55f51c..5a73a9c62c39 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -89,27 +89,16 @@ #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 #define MSR_IA32_RTIT_CTL 0x00000570 -#define RTIT_CTL_TRACEEN BIT(0) -#define RTIT_CTL_CYCLEACC BIT(1) -#define RTIT_CTL_OS BIT(2) -#define RTIT_CTL_USR BIT(3) -#define RTIT_CTL_CR3EN BIT(7) -#define RTIT_CTL_TOPA BIT(8) -#define RTIT_CTL_MTC_EN BIT(9) -#define RTIT_CTL_TSC_EN BIT(10) -#define RTIT_CTL_DISRETC BIT(11) -#define RTIT_CTL_BRANCH_EN BIT(13) -#define RTIT_CTL_MTC_RANGE_OFFSET 14 -#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET) -#define RTIT_CTL_CYC_THRESH_OFFSET 19 -#define RTIT_CTL_CYC_THRESH (0x0full << RTIT_CTL_CYC_THRESH_OFFSET) -#define RTIT_CTL_PSB_FREQ_OFFSET 24 -#define RTIT_CTL_PSB_FREQ (0x0full << RTIT_CTL_PSB_FREQ_OFFSET) #define MSR_IA32_RTIT_STATUS 0x00000571 -#define RTIT_STATUS_CONTEXTEN BIT(1) -#define RTIT_STATUS_TRIGGEREN BIT(2) -#define RTIT_STATUS_ERROR BIT(4) -#define RTIT_STATUS_STOPPED BIT(5) +#define MSR_IA32_RTIT_STATUS 0x00000571 +#define MSR_IA32_RTIT_ADDR0_A 0x00000580 +#define MSR_IA32_RTIT_ADDR0_B 0x00000581 +#define MSR_IA32_RTIT_ADDR1_A 0x00000582 +#define MSR_IA32_RTIT_ADDR1_B 0x00000583 +#define MSR_IA32_RTIT_ADDR2_A 0x00000584 +#define MSR_IA32_RTIT_ADDR2_B 0x00000585 +#define MSR_IA32_RTIT_ADDR3_A 0x00000586 +#define MSR_IA32_RTIT_ADDR3_B 0x00000587 #define MSR_IA32_RTIT_CR3_MATCH 0x00000572 #define MSR_IA32_RTIT_OUTPUT_BASE 0x00000560 #define MSR_IA32_RTIT_OUTPUT_MASK 0x00000561 @@ -205,6 +194,8 @@ #define MSR_CONFIG_TDP_CONTROL 0x0000064B #define MSR_TURBO_ACTIVATION_RATIO 0x0000064C +#define MSR_PLATFORM_ENERGY_STATUS 0x0000064D + #define MSR_PKG_WEIGHTED_CORE_C0_RES 0x00000658 #define MSR_PKG_ANY_CORE_C0_RES 0x00000659 #define MSR_PKG_ANY_GFXE_C0_RES 0x0000065A @@ -315,6 +306,9 @@ #define MSR_AMD64_IBSOPDATA4 0xc001103d #define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */ +/* Fam 17h MSRs */ +#define MSR_F17H_IRPERF 0xc00000e9 + /* Fam 16h MSRs */ #define MSR_F16H_L2I_PERF_CTL 0xc0010230 #define MSR_F16H_L2I_PERF_CTR 0xc0010231 @@ -328,6 +322,7 @@ #define MSR_F15H_PERF_CTR 0xc0010201 #define MSR_F15H_NB_PERF_CTL 0xc0010240 #define MSR_F15H_NB_PERF_CTR 0xc0010241 +#define MSR_F15H_PTSC 0xc0010280 #define MSR_F15H_IC_CFG 0xc0011021 /* Fam 10h MSRs */ diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 7a79ee2778b3..7dc1d8fef7fd 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -84,7 +84,10 @@ static inline unsigned long long native_read_msr(unsigned int msr) { DECLARE_ARGS(val, low, high); - asm volatile("rdmsr" : EAX_EDX_RET(val, low, high) : "c" (msr)); + asm volatile("1: rdmsr\n" + "2:\n" + _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_unsafe) + : EAX_EDX_RET(val, low, high) : "c" (msr)); if (msr_tracepoint_active(__tracepoint_read_msr)) do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), 0); return EAX_EDX_VAL(val, low, high); @@ -98,7 +101,10 @@ static inline unsigned long long native_read_msr_safe(unsigned int msr, asm volatile("2: rdmsr ; xor 
%[err],%[err]\n" "1:\n\t" ".section .fixup,\"ax\"\n\t" - "3: mov %[fault],%[err] ; jmp 1b\n\t" + "3: mov %[fault],%[err]\n\t" + "xorl %%eax, %%eax\n\t" + "xorl %%edx, %%edx\n\t" + "jmp 1b\n\t" ".previous\n\t" _ASM_EXTABLE(2b, 3b) : [err] "=r" (*err), EAX_EDX_RET(val, low, high) @@ -108,10 +114,14 @@ static inline unsigned long long native_read_msr_safe(unsigned int msr, return EAX_EDX_VAL(val, low, high); } -static inline void native_write_msr(unsigned int msr, - unsigned low, unsigned high) +/* Can be uninlined because referenced by paravirt */ +notrace static inline void native_write_msr(unsigned int msr, + unsigned low, unsigned high) { - asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory"); + asm volatile("1: wrmsr\n" + "2:\n" + _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_unsafe) + : : "c" (msr), "a"(low), "d" (high) : "memory"); if (msr_tracepoint_active(__tracepoint_read_msr)) do_trace_write_msr(msr, ((u64)high << 32 | low), 0); } diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index b94f6f64e23d..dbff1456d215 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -24,6 +24,7 @@ #define _ASM_X86_MTRR_H #include <uapi/asm/mtrr.h> +#include <asm/pat.h> /* @@ -83,9 +84,12 @@ static inline int mtrr_trim_uncached_memory(unsigned long end_pfn) static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) { } +static inline void mtrr_bp_init(void) +{ + pat_disable("MTRRs disabled, skipping PAT initialization too."); +} #define mtrr_ap_init() do {} while (0) -#define mtrr_bp_init() do {} while (0) #define set_mtrr_aps_delayed_init() do {} while (0) #define mtrr_aps_init() do {} while (0) #define mtrr_bp_restore() do {} while (0) diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index 802dde30c928..cf8f619b305f 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -37,7 +37,10 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr, alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr) #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE +#ifndef __pa #define __pa(x) __phys_addr((unsigned long)(x)) +#endif + #define __pa_nodebug(x) __phys_addr_nodebug((unsigned long)(x)) /* __pa_symbol should be used for C visible symbols. This seems to be the official gcc blessed way to do such arithmetic. */ @@ -51,7 +54,9 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr, #define __pa_symbol(x) \ __phys_addr_symbol(__phys_reloc_hide((unsigned long)(x))) +#ifndef __va #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) +#endif #define __boot_va(x) __va(x) #define __boot_pa(x) __pa(x) diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 4928cf0d5af0..d5c2f8b40faa 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -47,12 +47,10 @@ * are fully set up. If kernel ASLR is configured, it can extend the * kernel page table mapping, reducing the size of the modules area. 
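Referring back to the msr.h hunks above: the _safe read variant now zeroes EAX:EDX in its fixup, so a faulting rdmsr yields 0 rather than stale register contents, and the plain variants gain exception-table handlers instead of oopsing. A short consumer sketch using one of the MSRs introduced in this series; example_read_irperf() is hypothetical:

static u64 example_read_irperf(void)
{
	u64 count;

	/* Non-zero return means the MSR faulted; the fixup left count == 0. */
	if (rdmsrl_safe(MSR_F17H_IRPERF, &count))
		return 0;
	return count;
}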
*/ -#define KERNEL_IMAGE_SIZE_DEFAULT (512 * 1024 * 1024) -#if defined(CONFIG_RANDOMIZE_BASE) && \ - CONFIG_RANDOMIZE_BASE_MAX_OFFSET > KERNEL_IMAGE_SIZE_DEFAULT -#define KERNEL_IMAGE_SIZE CONFIG_RANDOMIZE_BASE_MAX_OFFSET +#if defined(CONFIG_RANDOMIZE_BASE) +#define KERNEL_IMAGE_SIZE (1024 * 1024 * 1024) #else -#define KERNEL_IMAGE_SIZE KERNEL_IMAGE_SIZE_DEFAULT +#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) #endif #endif /* _ASM_X86_PAGE_64_DEFS_H */ diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 601f1b8f9961..2970d22d7766 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -15,17 +15,6 @@ #include <linux/cpumask.h> #include <asm/frame.h> -static inline int paravirt_enabled(void) -{ - return pv_info.paravirt_enabled; -} - -static inline int paravirt_has_feature(unsigned int feature) -{ - WARN_ON_ONCE(!pv_info.paravirt_enabled); - return (pv_info.features & feature); -} - static inline void load_sp0(struct tss_struct *tss, struct thread_struct *thread) { @@ -130,21 +119,31 @@ static inline void wbinvd(void) #define get_kernel_rpl() (pv_info.kernel_rpl) -static inline u64 paravirt_read_msr(unsigned msr, int *err) +static inline u64 paravirt_read_msr(unsigned msr) +{ + return PVOP_CALL1(u64, pv_cpu_ops.read_msr, msr); +} + +static inline void paravirt_write_msr(unsigned msr, + unsigned low, unsigned high) +{ + return PVOP_VCALL3(pv_cpu_ops.write_msr, msr, low, high); +} + +static inline u64 paravirt_read_msr_safe(unsigned msr, int *err) { - return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err); + return PVOP_CALL2(u64, pv_cpu_ops.read_msr_safe, msr, err); } -static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high) +static inline int paravirt_write_msr_safe(unsigned msr, + unsigned low, unsigned high) { - return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high); + return PVOP_CALL3(int, pv_cpu_ops.write_msr_safe, msr, low, high); } -/* These should all do BUG_ON(_err), but our headers are too tangled. 
*/ #define rdmsr(msr, val1, val2) \ do { \ - int _err; \ - u64 _l = paravirt_read_msr(msr, &_err); \ + u64 _l = paravirt_read_msr(msr); \ val1 = (u32)_l; \ val2 = _l >> 32; \ } while (0) @@ -156,8 +155,7 @@ do { \ #define rdmsrl(msr, val) \ do { \ - int _err; \ - val = paravirt_read_msr(msr, &_err); \ + val = paravirt_read_msr(msr); \ } while (0) static inline void wrmsrl(unsigned msr, u64 val) @@ -165,23 +163,23 @@ static inline void wrmsrl(unsigned msr, u64 val) wrmsr(msr, (u32)val, (u32)(val>>32)); } -#define wrmsr_safe(msr, a, b) paravirt_write_msr(msr, a, b) +#define wrmsr_safe(msr, a, b) paravirt_write_msr_safe(msr, a, b) /* rdmsr with exception handling */ -#define rdmsr_safe(msr, a, b) \ -({ \ - int _err; \ - u64 _l = paravirt_read_msr(msr, &_err); \ - (*a) = (u32)_l; \ - (*b) = _l >> 32; \ - _err; \ +#define rdmsr_safe(msr, a, b) \ +({ \ + int _err; \ + u64 _l = paravirt_read_msr_safe(msr, &_err); \ + (*a) = (u32)_l; \ + (*b) = _l >> 32; \ + _err; \ }) static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) { int err; - *p = paravirt_read_msr(msr, &err); + *p = paravirt_read_msr_safe(msr, &err); return err; } diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index e8c2326478c8..7fa9e7740ba3 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -69,15 +69,9 @@ struct pv_info { u16 extra_user_64bit_cs; /* __USER_CS if none */ #endif - int paravirt_enabled; - unsigned int features; /* valid only if paravirt_enabled is set */ const char *name; }; -#define paravirt_has(x) paravirt_has_feature(PV_SUPPORTED_##x) -/* Supported features */ -#define PV_SUPPORTED_RTC (1<<0) - struct pv_init_ops { /* * Patch may replace one of the defined code sequences with @@ -155,10 +149,16 @@ struct pv_cpu_ops { void (*cpuid)(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx); - /* MSR, PMC and TSR operations. - err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */ - u64 (*read_msr)(unsigned int msr, int *err); - int (*write_msr)(unsigned int msr, unsigned low, unsigned high); + /* Unsafe MSR operations. These will warn or panic on failure. */ + u64 (*read_msr)(unsigned int msr); + void (*write_msr)(unsigned int msr, unsigned low, unsigned high); + + /* + * Safe MSR operations. + * read sets err to 0 or -EIO. write returns 0 or -EIO. 
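With the pv_cpu_ops split above, every MSR access comes in two flavors: the plain accessors, which warn or panic on failure, and the _safe accessors, which hand the caller 0 or -EIO. A minimal sketch of the two call styles as seen through rdmsrl()/rdmsrl_safe(); the MSR is just an example:

	u64 val;

	rdmsrl(MSR_IA32_APERF, val);		/* unsafe: a fault warns or panics	*/
	if (rdmsrl_safe(MSR_IA32_APERF, &val))	/* safe: a fault reports -EIO instead	*/
		val = 0;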
+ */ + u64 (*read_msr_safe)(unsigned int msr, int *err); + int (*write_msr_safe)(unsigned int msr, unsigned low, unsigned high); u64 (*read_pmc)(int counter); diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index ca6c228d5e62..0b1ff4c1c14e 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h @@ -5,8 +5,8 @@ #include <asm/pgtable_types.h> bool pat_enabled(void); +void pat_disable(const char *reason); extern void pat_init(void); -void pat_init_cache_modes(u64); extern int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_pcm, enum page_cache_mode *ret_pcm); diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 97f3242e133c..f86491a7bc9d 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -183,7 +183,7 @@ static inline int pmd_trans_huge(pmd_t pmd) static inline int has_transparent_hugepage(void) { - return cpu_has_pse; + return boot_cpu_has(X86_FEATURE_PSE); } #ifdef __HAVE_ARCH_PTE_DEVMAP diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 9264476f3d57..62c6cc3cc5d3 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -388,9 +388,16 @@ struct thread_struct { unsigned long ip; #endif #ifdef CONFIG_X86_64 - unsigned long fs; + unsigned long fsbase; + unsigned long gsbase; +#else + /* + * XXX: this could presumably be unsigned short. Alternatively, + * 32-bit kernels could be taught to use fsindex instead. + */ + unsigned long fs; + unsigned long gs; #endif - unsigned long gs; /* Save middle states of ptrace breakpoints */ struct perf_event *ptrace_bps[HBP_NUM]; @@ -473,8 +480,6 @@ static inline unsigned long current_top_of_stack(void) #include <asm/paravirt.h> #else #define __cpuid native_cpuid -#define paravirt_enabled() 0 -#define paravirt_has(x) 0 static inline void load_sp0(struct tss_struct *tss, struct thread_struct *thread) diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index ceec86eb68e9..453744c1d347 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -99,26 +99,36 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) /* * lock for writing */ -static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) +#define ____down_write(sem, slow_path) \ +({ \ + long tmp; \ + struct rw_semaphore* ret; \ + asm volatile("# beginning down_write\n\t" \ + LOCK_PREFIX " xadd %1,(%3)\n\t" \ + /* adds 0xffff0001, returns the old value */ \ + " test " __ASM_SEL(%w1,%k1) "," __ASM_SEL(%w1,%k1) "\n\t" \ + /* was the active mask 0 before? */\ + " jz 1f\n" \ + " call " slow_path "\n" \ + "1:\n" \ + "# ending down_write" \ + : "+m" (sem->count), "=d" (tmp), "=a" (ret) \ + : "a" (sem), "1" (RWSEM_ACTIVE_WRITE_BIAS) \ + : "memory", "cc"); \ + ret; \ +}) + +static inline void __down_write(struct rw_semaphore *sem) { - long tmp; - asm volatile("# beginning down_write\n\t" - LOCK_PREFIX " xadd %1,(%2)\n\t" - /* adds 0xffff0001, returns the old value */ - " test " __ASM_SEL(%w1,%k1) "," __ASM_SEL(%w1,%k1) "\n\t" - /* was the active mask 0 before? 
*/ - " jz 1f\n" - " call call_rwsem_down_write_failed\n" - "1:\n" - "# ending down_write" - : "+m" (sem->count), "=d" (tmp) - : "a" (sem), "1" (RWSEM_ACTIVE_WRITE_BIAS) - : "memory", "cc"); + ____down_write(sem, "call_rwsem_down_write_failed"); } -static inline void __down_write(struct rw_semaphore *sem) +static inline int __down_write_killable(struct rw_semaphore *sem) { - __down_write_nested(sem, 0); + if (IS_ERR(____down_write(sem, "call_rwsem_down_write_failed_killable"))) + return -EINTR; + + return 0; } /* diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 7d5a1929d76b..1549caa098f0 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -2,6 +2,7 @@ #define _ASM_X86_SEGMENT_H #include <linux/const.h> +#include <asm/alternative.h> /* * Constructor for a conventional segment GDT (or LDT) entry. @@ -207,13 +208,6 @@ #define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3) #define __PER_CPU_SEG (GDT_ENTRY_PER_CPU*8 + 3) -/* TLS indexes for 64-bit - hardcoded in arch_prctl(): */ -#define FS_TLS 0 -#define GS_TLS 1 - -#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3) -#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3) - #endif #ifndef CONFIG_PARAVIRT @@ -249,10 +243,13 @@ extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDL #endif /* - * Load a segment. Fall back on loading the zero - * segment if something goes wrong.. + * Load a segment. Fall back on loading the zero segment if something goes + * wrong. This variant assumes that loading zero fully clears the segment. + * This is always the case on Intel CPUs and, even on 64-bit AMD CPUs, any + * failure to fully clear the cached descriptor is only observable for + * FS and GS. */ -#define loadsegment(seg, value) \ +#define __loadsegment_simple(seg, value) \ do { \ unsigned short __val = (value); \ \ @@ -269,6 +266,38 @@ do { \ : "+r" (__val) : : "memory"); \ } while (0) +#define __loadsegment_ss(value) __loadsegment_simple(ss, (value)) +#define __loadsegment_ds(value) __loadsegment_simple(ds, (value)) +#define __loadsegment_es(value) __loadsegment_simple(es, (value)) + +#ifdef CONFIG_X86_32 + +/* + * On 32-bit systems, the hidden parts of FS and GS are unobservable if + * the selector is NULL, so there's no funny business here. + */ +#define __loadsegment_fs(value) __loadsegment_simple(fs, (value)) +#define __loadsegment_gs(value) __loadsegment_simple(gs, (value)) + +#else + +static inline void __loadsegment_fs(unsigned short value) +{ + asm volatile(" \n" + "1: movw %0, %%fs \n" + "2: \n" + + _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_clear_fs) + + : : "rm" (value) : "memory"); +} + +/* __loadsegment_gs is intentionally undefined. Use load_gs_index instead. 
*/ + +#endif + +#define loadsegment(seg, value) __loadsegment_ ## seg (value) + /* * Save a segment register away: */ diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 11af24e09c8a..ac1d5da14734 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -6,6 +6,7 @@ #define COMMAND_LINE_SIZE 2048 #include <linux/linkage.h> +#include <asm/page_types.h> #ifdef __i386__ diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 751bf4b7bf11..8f321a1b03a1 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -39,8 +39,7 @@ do { \ */ \ unsigned long ebx, ecx, edx, esi, edi; \ \ - asm volatile("pushfl\n\t" /* save flags */ \ - "pushl %%ebp\n\t" /* save EBP */ \ + asm volatile("pushl %%ebp\n\t" /* save EBP */ \ "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \ "movl %[next_sp],%%esp\n\t" /* restore ESP */ \ "movl $1f,%[prev_ip]\n\t" /* save EIP */ \ @@ -49,7 +48,6 @@ do { \ "jmp __switch_to\n" /* regparm call */ \ "1:\t" \ "popl %%ebp\n\t" /* restore EBP */ \ - "popfl\n" /* restore flags */ \ \ /* output parameters */ \ : [prev_sp] "=m" (prev->thread.sp), \ diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h new file mode 100644 index 000000000000..90395063383c --- /dev/null +++ b/arch/x86/include/asm/text-patching.h @@ -0,0 +1,40 @@ +#ifndef _ASM_X86_TEXT_PATCHING_H +#define _ASM_X86_TEXT_PATCHING_H + +#include <linux/types.h> +#include <linux/stddef.h> +#include <asm/ptrace.h> + +struct paravirt_patch_site; +#ifdef CONFIG_PARAVIRT +void apply_paravirt(struct paravirt_patch_site *start, + struct paravirt_patch_site *end); +#else +static inline void apply_paravirt(struct paravirt_patch_site *start, + struct paravirt_patch_site *end) +{} +#define __parainstructions NULL +#define __parainstructions_end NULL +#endif + +extern void *text_poke_early(void *addr, const void *opcode, size_t len); + +/* + * Clear and restore the kernel write-protection flag on the local CPU. + * Allows the kernel to edit read-only pages. + * Side-effect: any interrupt handler running between save and restore will have + * the ability to write to read-only pages. + * + * Warning: + * Code patching in the UP case is safe if NMIs and MCE handlers are stopped and + * no thread can be preempted in the instructions being modified (no iret to an + * invalid instruction possible) or if the instructions are changed from a + * consistent state to another consistent state atomically. + * On the local CPU you need to be protected again NMI or MCE handlers seeing an + * inconsistent instruction while you patch. 
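To make the constraints above concrete, a small sketch (not part of the patch) of the usual call pattern for the interfaces declared just below: text_poke_early() while only the boot CPU is running, text_poke_bp() once other CPUs are live, with the int3 handler covering the transition. The example_* name, the 5-byte length and the 'early' flag are assumptions for illustration.

	/* Sketch: replace a 5-byte instruction at 'addr' with 'insn'.
	 * 'resume' is where a CPU hitting the transient int3 continues. */
	static void example_patch(void *addr, const void *insn,
				  void *resume, bool early)
	{
		if (early)
			text_poke_early(addr, insn, 5);		/* boot, no other CPUs yet */
		else
			text_poke_bp(addr, insn, 5, resume);	/* live patching via int3 */
	}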
+ */ +extern void *text_poke(void *addr, const void *opcode, size_t len); +extern int poke_int3_handler(struct pt_regs *regs); +extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); + +#endif /* _ASM_X86_TEXT_PATCHING_H */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index ffae84df8a93..30c133ac05cd 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -255,7 +255,7 @@ static inline bool test_and_clear_restore_sigmask(void) return true; } -static inline bool is_ia32_task(void) +static inline bool in_ia32_syscall(void) { #ifdef CONFIG_X86_32 return true; diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 1fde8d580a5b..4e5be94e079a 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -181,7 +181,7 @@ static inline void __native_flush_tlb_single(unsigned long addr) static inline void __flush_tlb_all(void) { - if (cpu_has_pge) + if (static_cpu_has(X86_FEATURE_PGE)) __flush_tlb_global(); else __flush_tlb(); diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 174c4212780a..7428697c5b8d 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -22,7 +22,7 @@ extern void disable_TSC(void); static inline cycles_t get_cycles(void) { #ifndef CONFIG_X86_TSC - if (!cpu_has_tsc) + if (!boot_cpu_has(X86_FEATURE_TSC)) return 0; #endif diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 2e7513d1f1f4..12f9653bde8d 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -118,7 +118,7 @@ struct exception_table_entry { extern int fixup_exception(struct pt_regs *regs, int trapnr); extern bool ex_has_fault_handler(unsigned long ip); -extern int early_fixup_exception(unsigned long *ip); +extern void early_fixup_exception(struct pt_regs *regs, int trapnr); /* * These are the main single-value transfer routines. They automatically diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index 71605c7d5c5c..c852590254d5 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -51,15 +51,66 @@ enum { BIOS_STATUS_UNAVAIL = -EBUSY }; +/* Address map parameters */ +struct uv_gam_parameters { + u64 mmr_base; + u64 gru_base; + u8 mmr_shift; /* Convert PNode to MMR space offset */ + u8 gru_shift; /* Convert PNode to GRU space offset */ + u8 gpa_shift; /* Size of offset field in GRU phys addr */ + u8 unused1; +}; + +/* UV_TABLE_GAM_RANGE_ENTRY values */ +#define UV_GAM_RANGE_TYPE_UNUSED 0 /* End of table */ +#define UV_GAM_RANGE_TYPE_RAM 1 /* Normal RAM */ +#define UV_GAM_RANGE_TYPE_NVRAM 2 /* Non-volatile memory */ +#define UV_GAM_RANGE_TYPE_NV_WINDOW 3 /* NVMDIMM block window */ +#define UV_GAM_RANGE_TYPE_NV_MAILBOX 4 /* NVMDIMM mailbox */ +#define UV_GAM_RANGE_TYPE_HOLE 5 /* Unused address range */ +#define UV_GAM_RANGE_TYPE_MAX 6 + +/* The structure stores PA bits 56:26, for 64MB granularity */ +#define UV_GAM_RANGE_SHFT 26 /* 64MB */ + +struct uv_gam_range_entry { + char type; /* Entry type: GAM_RANGE_TYPE_UNUSED, etc. 
*/ + char unused1; + u16 nasid; /* HNasid */ + u16 sockid; /* Socket ID, high bits of APIC ID */ + u16 pnode; /* Index to MMR and GRU spaces */ + u32 pxm; /* ACPI proximity domain number */ + u32 limit; /* PA bits 56:26 (UV_GAM_RANGE_SHFT) */ +}; + +#define UV_SYSTAB_SIG "UVST" +#define UV_SYSTAB_VERSION_1 1 /* UV1/2/3 BIOS version */ +#define UV_SYSTAB_VERSION_UV4 0x400 /* UV4 BIOS base version */ +#define UV_SYSTAB_VERSION_UV4_1 0x401 /* + gpa_shift */ +#define UV_SYSTAB_VERSION_UV4_2 0x402 /* + TYPE_NVRAM/WINDOW/MBOX */ +#define UV_SYSTAB_VERSION_UV4_LATEST UV_SYSTAB_VERSION_UV4_2 + +#define UV_SYSTAB_TYPE_UNUSED 0 /* End of table (offset == 0) */ +#define UV_SYSTAB_TYPE_GAM_PARAMS 1 /* GAM PARAM conversions */ +#define UV_SYSTAB_TYPE_GAM_RNG_TBL 2 /* GAM entry table */ +#define UV_SYSTAB_TYPE_MAX 3 + /* * The UV system table describes specific firmware * capabilities available to the Linux kernel at runtime. */ struct uv_systab { - char signature[4]; /* must be "UVST" */ + char signature[4]; /* must be UV_SYSTAB_SIG */ u32 revision; /* distinguish different firmware revs */ u64 function; /* BIOS runtime callback function ptr */ + u32 size; /* systab size (starting with _VERSION_UV4) */ + struct { + u32 type:8; /* type of entry */ + u32 offset:24; /* byte offset from struct start to entry */ + } entry[1]; /* additional entries follow */ }; +extern struct uv_systab *uv_systab; +/* (... end of definitions from UV BIOS ...) */ enum { BIOS_FREQ_BASE_PLATFORM = 0, @@ -99,7 +150,11 @@ extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect); extern s64 uv_bios_reserved_page_pa(u64, u64 *, u64 *, u64 *); extern int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus); +#ifdef CONFIG_EFI extern void uv_bios_init(void); +#else +void uv_bios_init(void) { } +#endif extern unsigned long sn_rtc_cycles_per_second; extern int uv_type; @@ -107,7 +162,7 @@ extern long sn_partition_id; extern long sn_coherency_id; extern long sn_region_size; extern long system_serial_number; -#define partition_coherence_id() (sn_coherency_id) +#define uv_partition_coherence_id() (sn_coherency_id) extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */ diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index fc808b83fccb..cc44d926c17e 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -598,7 +598,7 @@ struct bau_control { int timeout_tries; int ipi_attempts; int conseccompletes; - short nobau; + bool nobau; short baudisabled; short cpu; short osnode; diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index ea7074784cc4..097b80c989c4 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -16,9 +16,11 @@ #include <linux/percpu.h> #include <linux/timer.h> #include <linux/io.h> +#include <linux/topology.h> #include <asm/types.h> #include <asm/percpu.h> #include <asm/uv/uv_mmrs.h> +#include <asm/uv/bios.h> #include <asm/irq_vectors.h> #include <asm/io_apic.h> @@ -103,7 +105,6 @@ * processor APICID register. */ - /* * Maximum number of bricks in all partitions and in all coherency domains. * This is the total number of bricks accessible in the numalink fabric. 
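Since the layout above only states that further entries follow the header, here is a hedged sketch of how a consumer might walk the UV4 systab using the type/offset pairs and the GAM types defined in this header; the walker name is invented and no such function is part of this patch.

	/* Sketch: iterate UV4+ systab entries ('st' assumed mapped and validated). */
	static void example_walk_uv_systab(struct uv_systab *st)
	{
		int i;

		for (i = 0; st->entry[i].type != UV_SYSTAB_TYPE_UNUSED; i++) {
			void *p = (void *)st + st->entry[i].offset;

			if (st->entry[i].type == UV_SYSTAB_TYPE_GAM_PARAMS)
				;	/* p: struct uv_gam_parameters */
			else if (st->entry[i].type == UV_SYSTAB_TYPE_GAM_RNG_TBL)
				;	/* p: struct uv_gam_range_entry array,
					 * ended by UV_GAM_RANGE_TYPE_UNUSED */
		}
	}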
It @@ -127,6 +128,7 @@ */ #define UV_MAX_NASID_VALUE (UV_MAX_NUMALINK_BLADES * 2) +/* System Controller Interface Reg info */ struct uv_scir_s { struct timer_list timer; unsigned long offset; @@ -137,71 +139,173 @@ struct uv_scir_s { unsigned char enabled; }; +/* GAM (globally addressed memory) range table */ +struct uv_gam_range_s { + u32 limit; /* PA bits 56:26 (GAM_RANGE_SHFT) */ + u16 nasid; /* node's global physical address */ + s8 base; /* entry index of node's base addr */ + u8 reserved; +}; + /* * The following defines attributes of the HUB chip. These attributes are - * frequently referenced and are kept in the per-cpu data areas of each cpu. - * They are kept together in a struct to minimize cache misses. + * frequently referenced and are kept in a common per hub struct. + * After setup, the struct is read only, so it should be readily + * available in the L3 cache on the cpu socket for the node. */ struct uv_hub_info_s { unsigned long global_mmr_base; + unsigned long global_mmr_shift; unsigned long gpa_mask; - unsigned int gnode_extra; + unsigned short *socket_to_node; + unsigned short *socket_to_pnode; + unsigned short *pnode_to_socket; + struct uv_gam_range_s *gr_table; + unsigned short min_socket; + unsigned short min_pnode; + unsigned char m_val; + unsigned char n_val; + unsigned char gr_table_len; unsigned char hub_revision; unsigned char apic_pnode_shift; + unsigned char gpa_shift; unsigned char m_shift; unsigned char n_lshift; + unsigned int gnode_extra; unsigned long gnode_upper; unsigned long lowmem_remap_top; unsigned long lowmem_remap_base; + unsigned long global_gru_base; + unsigned long global_gru_shift; unsigned short pnode; unsigned short pnode_mask; unsigned short coherency_domain_number; unsigned short numa_blade_id; - unsigned char blade_processor_id; - unsigned char m_val; - unsigned char n_val; + unsigned short nr_possible_cpus; + unsigned short nr_online_cpus; + short memory_nid; +}; + +/* CPU specific info with a pointer to the hub common info struct */ +struct uv_cpu_info_s { + void *p_uv_hub_info; + unsigned char blade_cpu_id; struct uv_scir_s scir; }; +DECLARE_PER_CPU(struct uv_cpu_info_s, __uv_cpu_info); + +#define uv_cpu_info this_cpu_ptr(&__uv_cpu_info) +#define uv_cpu_info_per(cpu) (&per_cpu(__uv_cpu_info, cpu)) + +#define uv_scir_info (&uv_cpu_info->scir) +#define uv_cpu_scir_info(cpu) (&uv_cpu_info_per(cpu)->scir) + +/* Node specific hub common info struct */ +extern void **__uv_hub_info_list; +static inline struct uv_hub_info_s *uv_hub_info_list(int node) +{ + return (struct uv_hub_info_s *)__uv_hub_info_list[node]; +} + +static inline struct uv_hub_info_s *_uv_hub_info(void) +{ + return (struct uv_hub_info_s *)uv_cpu_info->p_uv_hub_info; +} +#define uv_hub_info _uv_hub_info() -DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); -#define uv_hub_info this_cpu_ptr(&__uv_hub_info) -#define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu)) +static inline struct uv_hub_info_s *uv_cpu_hub_info(int cpu) +{ + return (struct uv_hub_info_s *)uv_cpu_info_per(cpu)->p_uv_hub_info; +} + +#define UV_HUB_INFO_VERSION 0x7150 +extern int uv_hub_info_version(void); +static inline int uv_hub_info_check(int version) +{ + if (uv_hub_info_version() == version) + return 0; + + pr_crit("UV: uv_hub_info version(%x) mismatch, expecting(%x)\n", + uv_hub_info_version(), version); + + BUG(); /* Catastrophic - cannot continue on unknown UV system */ +} +#define _uv_hub_info_check() uv_hub_info_check(UV_HUB_INFO_VERSION) /* - * Hub revisions less than UV2_HUB_REVISION_BASE 
are UV1 hubs. All UV2 - * hubs have revision numbers greater than or equal to UV2_HUB_REVISION_BASE. + * HUB revision ranges for each UV HUB architecture. * This is a software convention - NOT the hardware revision numbers in * the hub chip. */ #define UV1_HUB_REVISION_BASE 1 #define UV2_HUB_REVISION_BASE 3 #define UV3_HUB_REVISION_BASE 5 +#define UV4_HUB_REVISION_BASE 7 +#ifdef UV1_HUB_IS_SUPPORTED static inline int is_uv1_hub(void) { return uv_hub_info->hub_revision < UV2_HUB_REVISION_BASE; } +#else +static inline int is_uv1_hub(void) +{ + return 0; +} +#endif +#ifdef UV2_HUB_IS_SUPPORTED static inline int is_uv2_hub(void) { return ((uv_hub_info->hub_revision >= UV2_HUB_REVISION_BASE) && (uv_hub_info->hub_revision < UV3_HUB_REVISION_BASE)); } +#else +static inline int is_uv2_hub(void) +{ + return 0; +} +#endif +#ifdef UV3_HUB_IS_SUPPORTED +static inline int is_uv3_hub(void) +{ + return ((uv_hub_info->hub_revision >= UV3_HUB_REVISION_BASE) && + (uv_hub_info->hub_revision < UV4_HUB_REVISION_BASE)); +} +#else static inline int is_uv3_hub(void) { - return uv_hub_info->hub_revision >= UV3_HUB_REVISION_BASE; + return 0; } +#endif -static inline int is_uv_hub(void) +#ifdef UV4_HUB_IS_SUPPORTED +static inline int is_uv4_hub(void) { - return uv_hub_info->hub_revision; + return uv_hub_info->hub_revision >= UV4_HUB_REVISION_BASE; } +#else +static inline int is_uv4_hub(void) +{ + return 0; +} +#endif -/* code common to uv2 and uv3 only */ static inline int is_uvx_hub(void) { - return uv_hub_info->hub_revision >= UV2_HUB_REVISION_BASE; + if (uv_hub_info->hub_revision >= UV2_HUB_REVISION_BASE) + return uv_hub_info->hub_revision; + + return 0; +} + +static inline int is_uv_hub(void) +{ +#ifdef UV1_HUB_IS_SUPPORTED + return uv_hub_info->hub_revision; +#endif + return is_uvx_hub(); } union uvh_apicid { @@ -243,24 +347,42 @@ union uvh_apicid { #define UV3_LOCAL_MMR_SIZE (32UL * 1024 * 1024) #define UV3_GLOBAL_MMR32_SIZE (32UL * 1024 * 1024) -#define UV_LOCAL_MMR_BASE (is_uv1_hub() ? UV1_LOCAL_MMR_BASE : \ - (is_uv2_hub() ? UV2_LOCAL_MMR_BASE : \ - UV3_LOCAL_MMR_BASE)) -#define UV_GLOBAL_MMR32_BASE (is_uv1_hub() ? UV1_GLOBAL_MMR32_BASE :\ - (is_uv2_hub() ? UV2_GLOBAL_MMR32_BASE :\ - UV3_GLOBAL_MMR32_BASE)) -#define UV_LOCAL_MMR_SIZE (is_uv1_hub() ? UV1_LOCAL_MMR_SIZE : \ - (is_uv2_hub() ? UV2_LOCAL_MMR_SIZE : \ - UV3_LOCAL_MMR_SIZE)) -#define UV_GLOBAL_MMR32_SIZE (is_uv1_hub() ? UV1_GLOBAL_MMR32_SIZE :\ - (is_uv2_hub() ? UV2_GLOBAL_MMR32_SIZE :\ - UV3_GLOBAL_MMR32_SIZE)) +#define UV4_LOCAL_MMR_BASE 0xfa000000UL +#define UV4_GLOBAL_MMR32_BASE 0xfc000000UL +#define UV4_LOCAL_MMR_SIZE (32UL * 1024 * 1024) +#define UV4_GLOBAL_MMR32_SIZE (16UL * 1024 * 1024) + +#define UV_LOCAL_MMR_BASE ( \ + is_uv1_hub() ? UV1_LOCAL_MMR_BASE : \ + is_uv2_hub() ? UV2_LOCAL_MMR_BASE : \ + is_uv3_hub() ? UV3_LOCAL_MMR_BASE : \ + /*is_uv4_hub*/ UV4_LOCAL_MMR_BASE) + +#define UV_GLOBAL_MMR32_BASE ( \ + is_uv1_hub() ? UV1_GLOBAL_MMR32_BASE : \ + is_uv2_hub() ? UV2_GLOBAL_MMR32_BASE : \ + is_uv3_hub() ? UV3_GLOBAL_MMR32_BASE : \ + /*is_uv4_hub*/ UV4_GLOBAL_MMR32_BASE) + +#define UV_LOCAL_MMR_SIZE ( \ + is_uv1_hub() ? UV1_LOCAL_MMR_SIZE : \ + is_uv2_hub() ? UV2_LOCAL_MMR_SIZE : \ + is_uv3_hub() ? UV3_LOCAL_MMR_SIZE : \ + /*is_uv4_hub*/ UV4_LOCAL_MMR_SIZE) + +#define UV_GLOBAL_MMR32_SIZE ( \ + is_uv1_hub() ? UV1_GLOBAL_MMR32_SIZE : \ + is_uv2_hub() ? UV2_GLOBAL_MMR32_SIZE : \ + is_uv3_hub() ? 
UV3_GLOBAL_MMR32_SIZE : \ + /*is_uv4_hub*/ UV4_GLOBAL_MMR32_SIZE) + #define UV_GLOBAL_MMR64_BASE (uv_hub_info->global_mmr_base) #define UV_GLOBAL_GRU_MMR_BASE 0x4000000 #define UV_GLOBAL_MMR32_PNODE_SHIFT 15 -#define UV_GLOBAL_MMR64_PNODE_SHIFT 26 +#define _UV_GLOBAL_MMR64_PNODE_SHIFT 26 +#define UV_GLOBAL_MMR64_PNODE_SHIFT (uv_hub_info->global_mmr_shift) #define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT)) @@ -307,18 +429,74 @@ union uvh_apicid { * between socket virtual and socket physical addresses. */ +/* global bits offset - number of local address bits in gpa for this UV arch */ +static inline unsigned int uv_gpa_shift(void) +{ + return uv_hub_info->gpa_shift; +} +#define _uv_gpa_shift + +/* Find node that has the address range that contains global address */ +static inline struct uv_gam_range_s *uv_gam_range(unsigned long pa) +{ + struct uv_gam_range_s *gr = uv_hub_info->gr_table; + unsigned long pal = (pa & uv_hub_info->gpa_mask) >> UV_GAM_RANGE_SHFT; + int i, num = uv_hub_info->gr_table_len; + + if (gr) { + for (i = 0; i < num; i++, gr++) { + if (pal < gr->limit) + return gr; + } + } + pr_crit("UV: GAM Range for 0x%lx not found at %p!\n", pa, gr); + BUG(); +} + +/* Return base address of node that contains global address */ +static inline unsigned long uv_gam_range_base(unsigned long pa) +{ + struct uv_gam_range_s *gr = uv_gam_range(pa); + int base = gr->base; + + if (base < 0) + return 0UL; + + return uv_hub_info->gr_table[base].limit; +} + +/* socket phys RAM --> UV global NASID (UV4+) */ +static inline unsigned long uv_soc_phys_ram_to_nasid(unsigned long paddr) +{ + return uv_gam_range(paddr)->nasid; +} +#define _uv_soc_phys_ram_to_nasid + +/* socket virtual --> UV global NASID (UV4+) */ +static inline unsigned long uv_gpa_nasid(void *v) +{ + return uv_soc_phys_ram_to_nasid(__pa(v)); +} + /* socket phys RAM --> UV global physical address */ static inline unsigned long uv_soc_phys_ram_to_gpa(unsigned long paddr) { + unsigned int m_val = uv_hub_info->m_val; + if (paddr < uv_hub_info->lowmem_remap_top) paddr |= uv_hub_info->lowmem_remap_base; paddr |= uv_hub_info->gnode_upper; - paddr = ((paddr << uv_hub_info->m_shift) >> uv_hub_info->m_shift) | - ((paddr >> uv_hub_info->m_val) << uv_hub_info->n_lshift); + if (m_val) + paddr = ((paddr << uv_hub_info->m_shift) + >> uv_hub_info->m_shift) | + ((paddr >> uv_hub_info->m_val) + << uv_hub_info->n_lshift); + else + paddr |= uv_soc_phys_ram_to_nasid(paddr) + << uv_hub_info->gpa_shift; return paddr; } - /* socket virtual --> UV global physical address */ static inline unsigned long uv_gpa(void *v) { @@ -338,54 +516,89 @@ static inline unsigned long uv_gpa_to_soc_phys_ram(unsigned long gpa) unsigned long paddr; unsigned long remap_base = uv_hub_info->lowmem_remap_base; unsigned long remap_top = uv_hub_info->lowmem_remap_top; + unsigned int m_val = uv_hub_info->m_val; + + if (m_val) + gpa = ((gpa << uv_hub_info->m_shift) >> uv_hub_info->m_shift) | + ((gpa >> uv_hub_info->n_lshift) << uv_hub_info->m_val); - gpa = ((gpa << uv_hub_info->m_shift) >> uv_hub_info->m_shift) | - ((gpa >> uv_hub_info->n_lshift) << uv_hub_info->m_val); paddr = gpa & uv_hub_info->gpa_mask; if (paddr >= remap_base && paddr < remap_base + remap_top) paddr -= remap_base; return paddr; } - -/* gpa -> pnode */ +/* gpa -> gnode */ static inline unsigned long uv_gpa_to_gnode(unsigned long gpa) { - return gpa >> uv_hub_info->n_lshift; + unsigned int n_lshift = uv_hub_info->n_lshift; + + if (n_lshift) + return gpa >> n_lshift; + + return 
uv_gam_range(gpa)->nasid >> 1; } /* gpa -> pnode */ static inline int uv_gpa_to_pnode(unsigned long gpa) { - unsigned long n_mask = (1UL << uv_hub_info->n_val) - 1; - - return uv_gpa_to_gnode(gpa) & n_mask; + return uv_gpa_to_gnode(gpa) & uv_hub_info->pnode_mask; } -/* gpa -> node offset*/ +/* gpa -> node offset */ static inline unsigned long uv_gpa_to_offset(unsigned long gpa) { - return (gpa << uv_hub_info->m_shift) >> uv_hub_info->m_shift; + unsigned int m_shift = uv_hub_info->m_shift; + + if (m_shift) + return (gpa << m_shift) >> m_shift; + + return (gpa & uv_hub_info->gpa_mask) - uv_gam_range_base(gpa); +} + +/* Convert socket to node */ +static inline int _uv_socket_to_node(int socket, unsigned short *s2nid) +{ + return s2nid ? s2nid[socket - uv_hub_info->min_socket] : socket; +} + +static inline int uv_socket_to_node(int socket) +{ + return _uv_socket_to_node(socket, uv_hub_info->socket_to_node); } /* pnode, offset --> socket virtual */ static inline void *uv_pnode_offset_to_vaddr(int pnode, unsigned long offset) { - return __va(((unsigned long)pnode << uv_hub_info->m_val) | offset); -} + unsigned int m_val = uv_hub_info->m_val; + unsigned long base; + unsigned short sockid, node, *p2s; + if (m_val) + return __va(((unsigned long)pnode << m_val) | offset); -/* - * Extract a PNODE from an APICID (full apicid, not processor subset) - */ + p2s = uv_hub_info->pnode_to_socket; + sockid = p2s ? p2s[pnode - uv_hub_info->min_pnode] : pnode; + node = uv_socket_to_node(sockid); + + /* limit address of previous socket is our base, except node 0 is 0 */ + if (!node) + return __va((unsigned long)offset); + + base = (unsigned long)(uv_hub_info->gr_table[node - 1].limit); + return __va(base << UV_GAM_RANGE_SHFT | offset); +} + +/* Extract/Convert a PNODE from an APICID (full apicid, not processor subset) */ static inline int uv_apicid_to_pnode(int apicid) { - return (apicid >> uv_hub_info->apic_pnode_shift); + int pnode = apicid >> uv_hub_info->apic_pnode_shift; + unsigned short *s2pn = uv_hub_info->socket_to_pnode; + + return s2pn ? s2pn[pnode - uv_hub_info->min_socket] : pnode; } -/* - * Convert an apicid to the socket number on the blade - */ +/* Convert an apicid to the socket number on the blade */ static inline int uv_apicid_to_socket(int apicid) { if (is_uv1_hub()) @@ -434,16 +647,6 @@ static inline unsigned long uv_read_global_mmr64(int pnode, unsigned long offset return readq(uv_global_mmr64_address(pnode, offset)); } -/* - * Global MMR space addresses when referenced by the GRU. (GRU does - * NOT use socket addressing). - */ -static inline unsigned long uv_global_gru_mmr_address(int pnode, unsigned long offset) -{ - return UV_GLOBAL_GRU_MMR_BASE | offset | - ((unsigned long)pnode << uv_hub_info->m_val); -} - static inline void uv_write_global_mmr8(int pnode, unsigned long offset, unsigned char val) { writeb(val, uv_global_mmr64_address(pnode, offset)); @@ -483,27 +686,23 @@ static inline void uv_write_local_mmr8(unsigned long offset, unsigned char val) writeb(val, uv_local_mmr_address(offset)); } -/* - * Structures and definitions for converting between cpu, node, pnode, and blade - * numbers. 
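As a quick illustration of how the conversion helpers above compose (a sketch, not code from this patch): a global address splits into its pnode and node offset, and the vaddr lookup then works on both the MMR-shift (m_val != 0) path and the UV4 GAM-table path.

	/* Sketch: map a UV global physical address back to a socket virtual
	 * address using only the helpers defined above. */
	static void *example_gpa_to_vaddr(unsigned long gpa)
	{
		int pnode = uv_gpa_to_pnode(gpa);
		unsigned long offset = uv_gpa_to_offset(gpa);

		return uv_pnode_offset_to_vaddr(pnode, offset);
	}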
- */ -struct uv_blade_info { - unsigned short nr_possible_cpus; - unsigned short nr_online_cpus; - unsigned short pnode; - short memory_nid; - spinlock_t nmi_lock; /* obsolete, see uv_hub_nmi */ - unsigned long nmi_count; /* obsolete, see uv_hub_nmi */ -}; -extern struct uv_blade_info *uv_blade_info; -extern short *uv_node_to_blade; -extern short *uv_cpu_to_blade; -extern short uv_possible_blades; - /* Blade-local cpu number of current cpu. Numbered 0 .. <# cpus on the blade> */ static inline int uv_blade_processor_id(void) { - return uv_hub_info->blade_processor_id; + return uv_cpu_info->blade_cpu_id; +} + +/* Blade-local cpu number of cpu N. Numbered 0 .. <# cpus on the blade> */ +static inline int uv_cpu_blade_processor_id(int cpu) +{ + return uv_cpu_info_per(cpu)->blade_cpu_id; +} +#define _uv_cpu_blade_processor_id 1 /* indicate function available */ + +/* Blade number to Node number (UV1..UV4 is 1:1) */ +static inline int uv_blade_to_node(int blade) +{ + return blade; } /* Blade number of current cpu. Numnbered 0 .. <#blades -1> */ @@ -512,55 +711,60 @@ static inline int uv_numa_blade_id(void) return uv_hub_info->numa_blade_id; } -/* Convert a cpu number to the the UV blade number */ -static inline int uv_cpu_to_blade_id(int cpu) +/* + * Convert linux node number to the UV blade number. + * .. Currently for UV1 thru UV4 the node and the blade are identical. + * .. If this changes then you MUST check references to this function! + */ +static inline int uv_node_to_blade_id(int nid) { - return uv_cpu_to_blade[cpu]; + return nid; } -/* Convert linux node number to the UV blade number */ -static inline int uv_node_to_blade_id(int nid) +/* Convert a cpu number to the the UV blade number */ +static inline int uv_cpu_to_blade_id(int cpu) { - return uv_node_to_blade[nid]; + return uv_node_to_blade_id(cpu_to_node(cpu)); } /* Convert a blade id to the PNODE of the blade */ static inline int uv_blade_to_pnode(int bid) { - return uv_blade_info[bid].pnode; + return uv_hub_info_list(uv_blade_to_node(bid))->pnode; } /* Nid of memory node on blade. -1 if no blade-local memory */ static inline int uv_blade_to_memory_nid(int bid) { - return uv_blade_info[bid].memory_nid; + return uv_hub_info_list(uv_blade_to_node(bid))->memory_nid; } /* Determine the number of possible cpus on a blade */ static inline int uv_blade_nr_possible_cpus(int bid) { - return uv_blade_info[bid].nr_possible_cpus; + return uv_hub_info_list(uv_blade_to_node(bid))->nr_possible_cpus; } /* Determine the number of online cpus on a blade */ static inline int uv_blade_nr_online_cpus(int bid) { - return uv_blade_info[bid].nr_online_cpus; + return uv_hub_info_list(uv_blade_to_node(bid))->nr_online_cpus; } /* Convert a cpu id to the PNODE of the blade containing the cpu */ static inline int uv_cpu_to_pnode(int cpu) { - return uv_blade_info[uv_cpu_to_blade_id(cpu)].pnode; + return uv_cpu_hub_info(cpu)->pnode; } /* Convert a linux node number to the PNODE of the blade */ static inline int uv_node_to_pnode(int nid) { - return uv_blade_info[uv_node_to_blade_id(nid)].pnode; + return uv_hub_info_list(nid)->pnode; } /* Maximum possible number of blades */ +extern short uv_possible_blades; static inline int uv_num_possible_blades(void) { return uv_possible_blades; @@ -578,9 +782,7 @@ extern void uv_nmi_setup(void); /* Newer SMM NMI handler, not present in all systems */ #define UVH_NMI_MMRX UVH_EVENT_OCCURRED0 #define UVH_NMI_MMRX_CLEAR UVH_EVENT_OCCURRED0_ALIAS -#define UVH_NMI_MMRX_SHIFT (is_uv1_hub() ? 
\ - UV1H_EVENT_OCCURRED0_EXTIO_INT0_SHFT :\ - UVXH_EVENT_OCCURRED0_EXTIO_INT0_SHFT) +#define UVH_NMI_MMRX_SHIFT UVH_EVENT_OCCURRED0_EXTIO_INT0_SHFT #define UVH_NMI_MMRX_TYPE "EXTIO_INT0" /* Non-zero indicates newer SMM NMI handler present */ @@ -622,9 +824,9 @@ DECLARE_PER_CPU(struct uv_cpu_nmi_s, uv_cpu_nmi); /* Update SCIR state */ static inline void uv_set_scir_bits(unsigned char value) { - if (uv_hub_info->scir.state != value) { - uv_hub_info->scir.state = value; - uv_write_local_mmr8(uv_hub_info->scir.offset, value); + if (uv_scir_info->state != value) { + uv_scir_info->state = value; + uv_write_local_mmr8(uv_scir_info->offset, value); } } @@ -635,10 +837,10 @@ static inline unsigned long uv_scir_offset(int apicid) static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value) { - if (uv_cpu_hub_info(cpu)->scir.state != value) { + if (uv_cpu_scir_info(cpu)->state != value) { uv_write_global_mmr8(uv_cpu_to_pnode(cpu), - uv_cpu_hub_info(cpu)->scir.offset, value); - uv_cpu_hub_info(cpu)->scir.state = value; + uv_cpu_scir_info(cpu)->offset, value); + uv_cpu_scir_info(cpu)->state = value; } } @@ -666,10 +868,7 @@ static inline void uv_hub_send_ipi(int pnode, int apicid, int vector) /* * Get the minimum revision number of the hub chips within the partition. - * 1 - UV1 rev 1.0 initial silicon - * 2 - UV1 rev 2.0 production silicon - * 3 - UV2 rev 1.0 initial silicon - * 5 - UV3 rev 1.0 initial silicon + * (See UVx_HUB_REVISION_BASE above for specific values.) */ static inline int uv_get_min_hub_revision_id(void) { diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h index ddd8db6b6e70..548d684a7960 100644 --- a/arch/x86/include/asm/uv/uv_mmrs.h +++ b/arch/x86/include/asm/uv/uv_mmrs.h @@ -5,7 +5,7 @@ * * SGI UV MMR definitions * - * Copyright (C) 2007-2014 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2007-2016 Silicon Graphics, Inc. All rights reserved. */ #ifndef _ASM_X86_UV_UV_MMRS_H @@ -18,10 +18,11 @@ * grouped by architecture types. * * UVH - definitions common to all UV hub types. - * UVXH - definitions common to all UV eXtended hub types (currently 2 & 3). + * UVXH - definitions common to all UV eXtended hub types (currently 2, 3, 4). * UV1H - definitions specific to UV type 1 hub. * UV2H - definitions specific to UV type 2 hub. * UV3H - definitions specific to UV type 3 hub. + * UV4H - definitions specific to UV type 4 hub. * * So in general, MMR addresses and structures are identical on all hubs types. * These MMRs are identified as: @@ -32,19 +33,25 @@ * } s; * }; * - * If the MMR exists on all hub types but have different addresses: + * If the MMR exists on all hub types but have different addresses, + * use a conditional operator to define the value at runtime. * #define UV1Hxxx a * #define UV2Hxxx b * #define UV3Hxxx c + * #define UV4Hxxx d * #define UVHxxx (is_uv1_hub() ? UV1Hxxx : * (is_uv2_hub() ? UV2Hxxx : - * UV3Hxxx)) + * (is_uv3_hub() ? UV3Hxxx : + * UV4Hxxx)) * - * If the MMR exists on all hub types > 1 but have different addresses: + * If the MMR exists on all hub types > 1 but have different addresses, the + * variation using "UVX" as the prefix exists. * #define UV2Hxxx b * #define UV3Hxxx c - * #define UVXHxxx (is_uv2_hub() ? UV2Hxxx : - * UV3Hxxx)) + * #define UV4Hxxx d + * #define UVHxxx (is_uv2_hub() ? UV2Hxxx : + * (is_uv3_hub() ? 
UV3Hxxx : + * UV4Hxxx)) * * union uvh_xxx { * unsigned long v; @@ -56,6 +63,8 @@ * } s2; * struct uv3h_xxx_s { # Full UV3 definition (*) * } s3; + * struct uv4h_xxx_s { # Full UV4 definition (*) + * } s4; * }; * (* - if present and different than the common struct) * @@ -73,7 +82,7 @@ * } sn; * }; * - * (GEN Flags: mflags_opt= undefs=0 UV23=UVXH) + * (GEN Flags: mflags_opt= undefs=function UV234=UVXH) */ #define UV_MMR_ENABLE (1UL << 63) @@ -83,20 +92,36 @@ #define UV2_HUB_PART_NUMBER_X 0x1111 #define UV3_HUB_PART_NUMBER 0x9578 #define UV3_HUB_PART_NUMBER_X 0x4321 +#define UV4_HUB_PART_NUMBER 0x99a1 /* Compat: Indicate which UV Hubs are supported. */ +#define UV1_HUB_IS_SUPPORTED 1 #define UV2_HUB_IS_SUPPORTED 1 #define UV3_HUB_IS_SUPPORTED 1 +#define UV4_HUB_IS_SUPPORTED 1 + +/* Error function to catch undefined references */ +extern unsigned long uv_undefined(char *str); /* ========================================================================= */ /* UVH_BAU_DATA_BROADCAST */ /* ========================================================================= */ #define UVH_BAU_DATA_BROADCAST 0x61688UL -#define UVH_BAU_DATA_BROADCAST_32 0x440 + +#define UV1H_BAU_DATA_BROADCAST_32 0x440 +#define UV2H_BAU_DATA_BROADCAST_32 0x440 +#define UV3H_BAU_DATA_BROADCAST_32 0x440 +#define UV4H_BAU_DATA_BROADCAST_32 0x360 +#define UVH_BAU_DATA_BROADCAST_32 ( \ + is_uv1_hub() ? UV1H_BAU_DATA_BROADCAST_32 : \ + is_uv2_hub() ? UV2H_BAU_DATA_BROADCAST_32 : \ + is_uv3_hub() ? UV3H_BAU_DATA_BROADCAST_32 : \ + /*is_uv4_hub*/ UV4H_BAU_DATA_BROADCAST_32) #define UVH_BAU_DATA_BROADCAST_ENABLE_SHFT 0 #define UVH_BAU_DATA_BROADCAST_ENABLE_MASK 0x0000000000000001UL + union uvh_bau_data_broadcast_u { unsigned long v; struct uvh_bau_data_broadcast_s { @@ -109,7 +134,16 @@ union uvh_bau_data_broadcast_u { /* UVH_BAU_DATA_CONFIG */ /* ========================================================================= */ #define UVH_BAU_DATA_CONFIG 0x61680UL -#define UVH_BAU_DATA_CONFIG_32 0x438 + +#define UV1H_BAU_DATA_CONFIG_32 0x438 +#define UV2H_BAU_DATA_CONFIG_32 0x438 +#define UV3H_BAU_DATA_CONFIG_32 0x438 +#define UV4H_BAU_DATA_CONFIG_32 0x358 +#define UVH_BAU_DATA_CONFIG_32 ( \ + is_uv1_hub() ? UV1H_BAU_DATA_CONFIG_32 : \ + is_uv2_hub() ? UV2H_BAU_DATA_CONFIG_32 : \ + is_uv3_hub() ? 
UV3H_BAU_DATA_CONFIG_32 : \ + /*is_uv4_hub*/ UV4H_BAU_DATA_CONFIG_32) #define UVH_BAU_DATA_CONFIG_VECTOR_SHFT 0 #define UVH_BAU_DATA_CONFIG_DM_SHFT 8 @@ -128,6 +162,7 @@ union uvh_bau_data_broadcast_u { #define UVH_BAU_DATA_CONFIG_M_MASK 0x0000000000010000UL #define UVH_BAU_DATA_CONFIG_APIC_ID_MASK 0xffffffff00000000UL + union uvh_bau_data_config_u { unsigned long v; struct uvh_bau_data_config_s { @@ -266,7 +301,6 @@ union uvh_bau_data_config_u { #define UV1H_EVENT_OCCURRED0_BAU_DATA_MASK 0x0080000000000000UL #define UV1H_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_MASK 0x0100000000000000UL -#define UVXH_EVENT_OCCURRED0_QP_HCERR_SHFT 1 #define UVXH_EVENT_OCCURRED0_RH_HCERR_SHFT 2 #define UVXH_EVENT_OCCURRED0_LH0_HCERR_SHFT 3 #define UVXH_EVENT_OCCURRED0_LH1_HCERR_SHFT 4 @@ -275,55 +309,11 @@ union uvh_bau_data_config_u { #define UVXH_EVENT_OCCURRED0_NI0_HCERR_SHFT 7 #define UVXH_EVENT_OCCURRED0_NI1_HCERR_SHFT 8 #define UVXH_EVENT_OCCURRED0_LB_AOERR0_SHFT 9 -#define UVXH_EVENT_OCCURRED0_QP_AOERR0_SHFT 10 #define UVXH_EVENT_OCCURRED0_LH0_AOERR0_SHFT 12 #define UVXH_EVENT_OCCURRED0_LH1_AOERR0_SHFT 13 #define UVXH_EVENT_OCCURRED0_GR0_AOERR0_SHFT 14 #define UVXH_EVENT_OCCURRED0_GR1_AOERR0_SHFT 15 #define UVXH_EVENT_OCCURRED0_XB_AOERR0_SHFT 16 -#define UVXH_EVENT_OCCURRED0_RT_AOERR0_SHFT 17 -#define UVXH_EVENT_OCCURRED0_NI0_AOERR0_SHFT 18 -#define UVXH_EVENT_OCCURRED0_NI1_AOERR0_SHFT 19 -#define UVXH_EVENT_OCCURRED0_LB_AOERR1_SHFT 20 -#define UVXH_EVENT_OCCURRED0_QP_AOERR1_SHFT 21 -#define UVXH_EVENT_OCCURRED0_RH_AOERR1_SHFT 22 -#define UVXH_EVENT_OCCURRED0_LH0_AOERR1_SHFT 23 -#define UVXH_EVENT_OCCURRED0_LH1_AOERR1_SHFT 24 -#define UVXH_EVENT_OCCURRED0_GR0_AOERR1_SHFT 25 -#define UVXH_EVENT_OCCURRED0_GR1_AOERR1_SHFT 26 -#define UVXH_EVENT_OCCURRED0_XB_AOERR1_SHFT 27 -#define UVXH_EVENT_OCCURRED0_RT_AOERR1_SHFT 28 -#define UVXH_EVENT_OCCURRED0_NI0_AOERR1_SHFT 29 -#define UVXH_EVENT_OCCURRED0_NI1_AOERR1_SHFT 30 -#define UVXH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 31 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 32 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 33 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 34 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 35 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 36 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 37 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 38 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 39 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 40 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 41 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 42 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 43 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 44 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 45 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 46 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 47 -#define UVXH_EVENT_OCCURRED0_L1_NMI_INT_SHFT 48 -#define UVXH_EVENT_OCCURRED0_STOP_CLOCK_SHFT 49 -#define UVXH_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 50 -#define UVXH_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 51 -#define UVXH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 52 -#define UVXH_EVENT_OCCURRED0_IPI_INT_SHFT 53 -#define UVXH_EVENT_OCCURRED0_EXTIO_INT0_SHFT 54 -#define UVXH_EVENT_OCCURRED0_EXTIO_INT1_SHFT 55 -#define UVXH_EVENT_OCCURRED0_EXTIO_INT2_SHFT 56 -#define UVXH_EVENT_OCCURRED0_EXTIO_INT3_SHFT 57 -#define UVXH_EVENT_OCCURRED0_PROFILE_INT_SHFT 58 -#define UVXH_EVENT_OCCURRED0_QP_HCERR_MASK 0x0000000000000002UL #define UVXH_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000004UL #define UVXH_EVENT_OCCURRED0_LH0_HCERR_MASK 0x0000000000000008UL #define 
UVXH_EVENT_OCCURRED0_LH1_HCERR_MASK 0x0000000000000010UL @@ -332,54 +322,294 @@ union uvh_bau_data_config_u { #define UVXH_EVENT_OCCURRED0_NI0_HCERR_MASK 0x0000000000000080UL #define UVXH_EVENT_OCCURRED0_NI1_HCERR_MASK 0x0000000000000100UL #define UVXH_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000200UL -#define UVXH_EVENT_OCCURRED0_QP_AOERR0_MASK 0x0000000000000400UL #define UVXH_EVENT_OCCURRED0_LH0_AOERR0_MASK 0x0000000000001000UL #define UVXH_EVENT_OCCURRED0_LH1_AOERR0_MASK 0x0000000000002000UL #define UVXH_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000004000UL #define UVXH_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000008000UL #define UVXH_EVENT_OCCURRED0_XB_AOERR0_MASK 0x0000000000010000UL -#define UVXH_EVENT_OCCURRED0_RT_AOERR0_MASK 0x0000000000020000UL -#define UVXH_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000040000UL -#define UVXH_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000080000UL -#define UVXH_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000100000UL -#define UVXH_EVENT_OCCURRED0_QP_AOERR1_MASK 0x0000000000200000UL -#define UVXH_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000400000UL -#define UVXH_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000000800000UL -#define UVXH_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000001000000UL -#define UVXH_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000002000000UL -#define UVXH_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000004000000UL -#define UVXH_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000008000000UL -#define UVXH_EVENT_OCCURRED0_RT_AOERR1_MASK 0x0000000010000000UL -#define UVXH_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000020000000UL -#define UVXH_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000000040000000UL -#define UVXH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000080000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000100000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000200000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000400000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000800000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000001000000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000002000000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000004000000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000008000000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000010000000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000020000000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000040000000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000080000000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000100000000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000200000000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000400000000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000800000000000UL -#define UVXH_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0001000000000000UL -#define UVXH_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0002000000000000UL -#define UVXH_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0004000000000000UL -#define UVXH_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0008000000000000UL -#define UVXH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0010000000000000UL -#define UVXH_EVENT_OCCURRED0_IPI_INT_MASK 0x0020000000000000UL -#define UVXH_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0040000000000000UL -#define UVXH_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0080000000000000UL -#define UVXH_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0100000000000000UL -#define UVXH_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0200000000000000UL -#define UVXH_EVENT_OCCURRED0_PROFILE_INT_MASK 
0x0400000000000000UL + +#define UV2H_EVENT_OCCURRED0_QP_HCERR_SHFT 1 +#define UV2H_EVENT_OCCURRED0_QP_AOERR0_SHFT 10 +#define UV2H_EVENT_OCCURRED0_RT_AOERR0_SHFT 17 +#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_SHFT 18 +#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_SHFT 19 +#define UV2H_EVENT_OCCURRED0_LB_AOERR1_SHFT 20 +#define UV2H_EVENT_OCCURRED0_QP_AOERR1_SHFT 21 +#define UV2H_EVENT_OCCURRED0_RH_AOERR1_SHFT 22 +#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_SHFT 23 +#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_SHFT 24 +#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 25 +#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 26 +#define UV2H_EVENT_OCCURRED0_XB_AOERR1_SHFT 27 +#define UV2H_EVENT_OCCURRED0_RT_AOERR1_SHFT 28 +#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_SHFT 29 +#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_SHFT 30 +#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 31 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 32 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 33 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 34 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 35 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 36 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 37 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 38 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 39 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 40 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 41 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 42 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 43 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 44 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 45 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 46 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 47 +#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 48 +#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 49 +#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 50 +#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 51 +#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 52 +#define UV2H_EVENT_OCCURRED0_IPI_INT_SHFT 53 +#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 54 +#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 55 +#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 56 +#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 57 +#define UV2H_EVENT_OCCURRED0_PROFILE_INT_SHFT 58 +#define UV2H_EVENT_OCCURRED0_QP_HCERR_MASK 0x0000000000000002UL +#define UV2H_EVENT_OCCURRED0_QP_AOERR0_MASK 0x0000000000000400UL +#define UV2H_EVENT_OCCURRED0_RT_AOERR0_MASK 0x0000000000020000UL +#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000040000UL +#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000080000UL +#define UV2H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000100000UL +#define UV2H_EVENT_OCCURRED0_QP_AOERR1_MASK 0x0000000000200000UL +#define UV2H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000400000UL +#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000000800000UL +#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000001000000UL +#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000002000000UL +#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000004000000UL +#define UV2H_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000008000000UL +#define UV2H_EVENT_OCCURRED0_RT_AOERR1_MASK 0x0000000010000000UL +#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000020000000UL +#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000000040000000UL +#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000080000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000100000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000200000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 
0x0000000400000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000800000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000001000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000002000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000004000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000008000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000010000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000020000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000040000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000080000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000100000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000200000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000400000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000800000000000UL +#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0001000000000000UL +#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0002000000000000UL +#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0004000000000000UL +#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0008000000000000UL +#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0010000000000000UL +#define UV2H_EVENT_OCCURRED0_IPI_INT_MASK 0x0020000000000000UL +#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0040000000000000UL +#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0080000000000000UL +#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0100000000000000UL +#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0200000000000000UL +#define UV2H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0400000000000000UL + +#define UV3H_EVENT_OCCURRED0_QP_HCERR_SHFT 1 +#define UV3H_EVENT_OCCURRED0_QP_AOERR0_SHFT 10 +#define UV3H_EVENT_OCCURRED0_RT_AOERR0_SHFT 17 +#define UV3H_EVENT_OCCURRED0_NI0_AOERR0_SHFT 18 +#define UV3H_EVENT_OCCURRED0_NI1_AOERR0_SHFT 19 +#define UV3H_EVENT_OCCURRED0_LB_AOERR1_SHFT 20 +#define UV3H_EVENT_OCCURRED0_QP_AOERR1_SHFT 21 +#define UV3H_EVENT_OCCURRED0_RH_AOERR1_SHFT 22 +#define UV3H_EVENT_OCCURRED0_LH0_AOERR1_SHFT 23 +#define UV3H_EVENT_OCCURRED0_LH1_AOERR1_SHFT 24 +#define UV3H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 25 +#define UV3H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 26 +#define UV3H_EVENT_OCCURRED0_XB_AOERR1_SHFT 27 +#define UV3H_EVENT_OCCURRED0_RT_AOERR1_SHFT 28 +#define UV3H_EVENT_OCCURRED0_NI0_AOERR1_SHFT 29 +#define UV3H_EVENT_OCCURRED0_NI1_AOERR1_SHFT 30 +#define UV3H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 31 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 32 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 33 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 34 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 35 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 36 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 37 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 38 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 39 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 40 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 41 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 42 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 43 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 44 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 45 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 46 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 47 +#define UV3H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 48 +#define UV3H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 49 +#define UV3H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 50 +#define UV3H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 51 +#define 
UV3H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 52 +#define UV3H_EVENT_OCCURRED0_IPI_INT_SHFT 53 +#define UV3H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 54 +#define UV3H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 55 +#define UV3H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 56 +#define UV3H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 57 +#define UV3H_EVENT_OCCURRED0_PROFILE_INT_SHFT 58 +#define UV3H_EVENT_OCCURRED0_QP_HCERR_MASK 0x0000000000000002UL +#define UV3H_EVENT_OCCURRED0_QP_AOERR0_MASK 0x0000000000000400UL +#define UV3H_EVENT_OCCURRED0_RT_AOERR0_MASK 0x0000000000020000UL +#define UV3H_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000040000UL +#define UV3H_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000080000UL +#define UV3H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000100000UL +#define UV3H_EVENT_OCCURRED0_QP_AOERR1_MASK 0x0000000000200000UL +#define UV3H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000400000UL +#define UV3H_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000000800000UL +#define UV3H_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000001000000UL +#define UV3H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000002000000UL +#define UV3H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000004000000UL +#define UV3H_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000008000000UL +#define UV3H_EVENT_OCCURRED0_RT_AOERR1_MASK 0x0000000010000000UL +#define UV3H_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000020000000UL +#define UV3H_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000000040000000UL +#define UV3H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000080000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000100000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000200000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000400000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000800000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000001000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000002000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000004000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000008000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000010000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000020000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000040000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000080000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000100000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000200000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000400000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000800000000000UL +#define UV3H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0001000000000000UL +#define UV3H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0002000000000000UL +#define UV3H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0004000000000000UL +#define UV3H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0008000000000000UL +#define UV3H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0010000000000000UL +#define UV3H_EVENT_OCCURRED0_IPI_INT_MASK 0x0020000000000000UL +#define UV3H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0040000000000000UL +#define UV3H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0080000000000000UL +#define UV3H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0100000000000000UL +#define UV3H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0200000000000000UL +#define UV3H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0400000000000000UL + +#define UV4H_EVENT_OCCURRED0_KT_HCERR_SHFT 1 +#define UV4H_EVENT_OCCURRED0_KT_AOERR0_SHFT 10 +#define UV4H_EVENT_OCCURRED0_RTQ0_AOERR0_SHFT 17 +#define UV4H_EVENT_OCCURRED0_RTQ1_AOERR0_SHFT 18 +#define 
UV4H_EVENT_OCCURRED0_RTQ2_AOERR0_SHFT 19 +#define UV4H_EVENT_OCCURRED0_RTQ3_AOERR0_SHFT 20 +#define UV4H_EVENT_OCCURRED0_NI0_AOERR0_SHFT 21 +#define UV4H_EVENT_OCCURRED0_NI1_AOERR0_SHFT 22 +#define UV4H_EVENT_OCCURRED0_LB_AOERR1_SHFT 23 +#define UV4H_EVENT_OCCURRED0_KT_AOERR1_SHFT 24 +#define UV4H_EVENT_OCCURRED0_RH_AOERR1_SHFT 25 +#define UV4H_EVENT_OCCURRED0_LH0_AOERR1_SHFT 26 +#define UV4H_EVENT_OCCURRED0_LH1_AOERR1_SHFT 27 +#define UV4H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 28 +#define UV4H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 29 +#define UV4H_EVENT_OCCURRED0_XB_AOERR1_SHFT 30 +#define UV4H_EVENT_OCCURRED0_RTQ0_AOERR1_SHFT 31 +#define UV4H_EVENT_OCCURRED0_RTQ1_AOERR1_SHFT 32 +#define UV4H_EVENT_OCCURRED0_RTQ2_AOERR1_SHFT 33 +#define UV4H_EVENT_OCCURRED0_RTQ3_AOERR1_SHFT 34 +#define UV4H_EVENT_OCCURRED0_NI0_AOERR1_SHFT 35 +#define UV4H_EVENT_OCCURRED0_NI1_AOERR1_SHFT 36 +#define UV4H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 37 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 38 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 39 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 40 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 41 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 42 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 43 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 44 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 45 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 46 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 47 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 48 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 49 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 50 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 51 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 52 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 53 +#define UV4H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 54 +#define UV4H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 55 +#define UV4H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 56 +#define UV4H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 57 +#define UV4H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 58 +#define UV4H_EVENT_OCCURRED0_IPI_INT_SHFT 59 +#define UV4H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 60 +#define UV4H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 61 +#define UV4H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 62 +#define UV4H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 63 +#define UV4H_EVENT_OCCURRED0_KT_HCERR_MASK 0x0000000000000002UL +#define UV4H_EVENT_OCCURRED0_KT_AOERR0_MASK 0x0000000000000400UL +#define UV4H_EVENT_OCCURRED0_RTQ0_AOERR0_MASK 0x0000000000020000UL +#define UV4H_EVENT_OCCURRED0_RTQ1_AOERR0_MASK 0x0000000000040000UL +#define UV4H_EVENT_OCCURRED0_RTQ2_AOERR0_MASK 0x0000000000080000UL +#define UV4H_EVENT_OCCURRED0_RTQ3_AOERR0_MASK 0x0000000000100000UL +#define UV4H_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000200000UL +#define UV4H_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000400000UL +#define UV4H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000800000UL +#define UV4H_EVENT_OCCURRED0_KT_AOERR1_MASK 0x0000000001000000UL +#define UV4H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000002000000UL +#define UV4H_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000004000000UL +#define UV4H_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000008000000UL +#define UV4H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000010000000UL +#define UV4H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000020000000UL +#define UV4H_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000040000000UL +#define UV4H_EVENT_OCCURRED0_RTQ0_AOERR1_MASK 0x0000000080000000UL +#define UV4H_EVENT_OCCURRED0_RTQ1_AOERR1_MASK 0x0000000100000000UL +#define UV4H_EVENT_OCCURRED0_RTQ2_AOERR1_MASK 0x0000000200000000UL +#define 
UV4H_EVENT_OCCURRED0_RTQ3_AOERR1_MASK 0x0000000400000000UL +#define UV4H_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000800000000UL +#define UV4H_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000001000000000UL +#define UV4H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000002000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000004000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000008000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000010000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000020000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000040000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000080000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000100000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000200000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000400000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000800000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0001000000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0002000000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0004000000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0008000000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0010000000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0020000000000000UL +#define UV4H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0040000000000000UL +#define UV4H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0080000000000000UL +#define UV4H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0100000000000000UL +#define UV4H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0200000000000000UL +#define UV4H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0400000000000000UL +#define UV4H_EVENT_OCCURRED0_IPI_INT_MASK 0x0800000000000000UL +#define UV4H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x1000000000000000UL +#define UV4H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x2000000000000000UL +#define UV4H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x4000000000000000UL +#define UV4H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x8000000000000000UL + +#define UVH_EVENT_OCCURRED0_EXTIO_INT0_SHFT ( \ + is_uv1_hub() ? UV1H_EVENT_OCCURRED0_EXTIO_INT0_SHFT : \ + is_uv2_hub() ? UV2H_EVENT_OCCURRED0_EXTIO_INT0_SHFT : \ + is_uv3_hub() ? 
UV3H_EVENT_OCCURRED0_EXTIO_INT0_SHFT : \ + /*is_uv4_hub*/ UV4H_EVENT_OCCURRED0_EXTIO_INT0_SHFT) union uvh_event_occurred0_u { unsigned long v; @@ -391,7 +621,7 @@ union uvh_event_occurred0_u { } s; struct uvxh_event_occurred0_s { unsigned long lb_hcerr:1; /* RW */ - unsigned long qp_hcerr:1; /* RW */ + unsigned long rsvd_1:1; unsigned long rh_hcerr:1; /* RW */ unsigned long lh0_hcerr:1; /* RW */ unsigned long lh1_hcerr:1; /* RW */ @@ -400,25 +630,51 @@ union uvh_event_occurred0_u { unsigned long ni0_hcerr:1; /* RW */ unsigned long ni1_hcerr:1; /* RW */ unsigned long lb_aoerr0:1; /* RW */ - unsigned long qp_aoerr0:1; /* RW */ + unsigned long rsvd_10:1; unsigned long rh_aoerr0:1; /* RW */ unsigned long lh0_aoerr0:1; /* RW */ unsigned long lh1_aoerr0:1; /* RW */ unsigned long gr0_aoerr0:1; /* RW */ unsigned long gr1_aoerr0:1; /* RW */ unsigned long xb_aoerr0:1; /* RW */ - unsigned long rt_aoerr0:1; /* RW */ + unsigned long rsvd_17_63:47; + } sx; + struct uv4h_event_occurred0_s { + unsigned long lb_hcerr:1; /* RW */ + unsigned long kt_hcerr:1; /* RW */ + unsigned long rh_hcerr:1; /* RW */ + unsigned long lh0_hcerr:1; /* RW */ + unsigned long lh1_hcerr:1; /* RW */ + unsigned long gr0_hcerr:1; /* RW */ + unsigned long gr1_hcerr:1; /* RW */ + unsigned long ni0_hcerr:1; /* RW */ + unsigned long ni1_hcerr:1; /* RW */ + unsigned long lb_aoerr0:1; /* RW */ + unsigned long kt_aoerr0:1; /* RW */ + unsigned long rh_aoerr0:1; /* RW */ + unsigned long lh0_aoerr0:1; /* RW */ + unsigned long lh1_aoerr0:1; /* RW */ + unsigned long gr0_aoerr0:1; /* RW */ + unsigned long gr1_aoerr0:1; /* RW */ + unsigned long xb_aoerr0:1; /* RW */ + unsigned long rtq0_aoerr0:1; /* RW */ + unsigned long rtq1_aoerr0:1; /* RW */ + unsigned long rtq2_aoerr0:1; /* RW */ + unsigned long rtq3_aoerr0:1; /* RW */ unsigned long ni0_aoerr0:1; /* RW */ unsigned long ni1_aoerr0:1; /* RW */ unsigned long lb_aoerr1:1; /* RW */ - unsigned long qp_aoerr1:1; /* RW */ + unsigned long kt_aoerr1:1; /* RW */ unsigned long rh_aoerr1:1; /* RW */ unsigned long lh0_aoerr1:1; /* RW */ unsigned long lh1_aoerr1:1; /* RW */ unsigned long gr0_aoerr1:1; /* RW */ unsigned long gr1_aoerr1:1; /* RW */ unsigned long xb_aoerr1:1; /* RW */ - unsigned long rt_aoerr1:1; /* RW */ + unsigned long rtq0_aoerr1:1; /* RW */ + unsigned long rtq1_aoerr1:1; /* RW */ + unsigned long rtq2_aoerr1:1; /* RW */ + unsigned long rtq3_aoerr1:1; /* RW */ unsigned long ni0_aoerr1:1; /* RW */ unsigned long ni1_aoerr1:1; /* RW */ unsigned long system_shutdown_int:1; /* RW */ @@ -448,9 +704,7 @@ union uvh_event_occurred0_u { unsigned long extio_int1:1; /* RW */ unsigned long extio_int2:1; /* RW */ unsigned long extio_int3:1; /* RW */ - unsigned long profile_int:1; /* RW */ - unsigned long rsvd_59_63:5; - } sx; + } s4; }; /* ========================================================================= */ @@ -464,11 +718,21 @@ union uvh_event_occurred0_u { /* UVH_EXTIO_INT0_BROADCAST */ /* ========================================================================= */ #define UVH_EXTIO_INT0_BROADCAST 0x61448UL -#define UVH_EXTIO_INT0_BROADCAST_32 0x3f0 + +#define UV1H_EXTIO_INT0_BROADCAST_32 0x3f0 +#define UV2H_EXTIO_INT0_BROADCAST_32 0x3f0 +#define UV3H_EXTIO_INT0_BROADCAST_32 0x3f0 +#define UV4H_EXTIO_INT0_BROADCAST_32 0x310 +#define UVH_EXTIO_INT0_BROADCAST_32 ( \ + is_uv1_hub() ? UV1H_EXTIO_INT0_BROADCAST_32 : \ + is_uv2_hub() ? UV2H_EXTIO_INT0_BROADCAST_32 : \ + is_uv3_hub() ? 
UV3H_EXTIO_INT0_BROADCAST_32 : \ + /*is_uv4_hub*/ UV4H_EXTIO_INT0_BROADCAST_32) #define UVH_EXTIO_INT0_BROADCAST_ENABLE_SHFT 0 #define UVH_EXTIO_INT0_BROADCAST_ENABLE_MASK 0x0000000000000001UL + union uvh_extio_int0_broadcast_u { unsigned long v; struct uvh_extio_int0_broadcast_s { @@ -499,6 +763,7 @@ union uvh_extio_int0_broadcast_u { #define UVH_GR0_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL #define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL + union uvh_gr0_tlb_int0_config_u { unsigned long v; struct uvh_gr0_tlb_int0_config_s { @@ -537,6 +802,7 @@ union uvh_gr0_tlb_int0_config_u { #define UVH_GR0_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL #define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL + union uvh_gr0_tlb_int1_config_u { unsigned long v; struct uvh_gr0_tlb_int1_config_s { @@ -559,19 +825,18 @@ union uvh_gr0_tlb_int1_config_u { #define UV1H_GR0_TLB_MMR_CONTROL 0x401080UL #define UV2H_GR0_TLB_MMR_CONTROL 0xc01080UL #define UV3H_GR0_TLB_MMR_CONTROL 0xc01080UL -#define UVH_GR0_TLB_MMR_CONTROL \ - (is_uv1_hub() ? UV1H_GR0_TLB_MMR_CONTROL : \ - (is_uv2_hub() ? UV2H_GR0_TLB_MMR_CONTROL : \ - UV3H_GR0_TLB_MMR_CONTROL)) +#define UV4H_GR0_TLB_MMR_CONTROL 0x601080UL +#define UVH_GR0_TLB_MMR_CONTROL ( \ + is_uv1_hub() ? UV1H_GR0_TLB_MMR_CONTROL : \ + is_uv2_hub() ? UV2H_GR0_TLB_MMR_CONTROL : \ + is_uv3_hub() ? UV3H_GR0_TLB_MMR_CONTROL : \ + /*is_uv4_hub*/ UV4H_GR0_TLB_MMR_CONTROL) #define UVH_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0 -#define UVH_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 #define UVH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 #define UVH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 #define UVH_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 #define UVH_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31 -#define UVH_GR0_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL -#define UVH_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL #define UVH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL #define UVH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL #define UVH_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL @@ -601,14 +866,11 @@ union uvh_gr0_tlb_int1_config_u { #define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_MASK 0x1000000000000000UL #define UVXH_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0 -#define UVXH_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 #define UVXH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 #define UVXH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 #define UVXH_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 #define UVXH_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31 #define UVXH_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32 -#define UVXH_GR0_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL -#define UVXH_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL #define UVXH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL #define UVXH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL #define UVXH_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL @@ -651,12 +913,45 @@ union uvh_gr0_tlb_int1_config_u { #define UV3H_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL #define UV3H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL +#define UV4H_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0 +#define UV4H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 13 +#define UV4H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 +#define UV4H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 +#define UV4H_GR0_TLB_MMR_CONTROL_ECC_SEL_SHFT 21 +#define UV4H_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 +#define UV4H_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31 +#define UV4H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32 +#define 
UV4H_GR0_TLB_MMR_CONTROL_PAGE_SIZE_SHFT 59 +#define UV4H_GR0_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000001fffUL +#define UV4H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000006000UL +#define UV4H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL +#define UV4H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL +#define UV4H_GR0_TLB_MMR_CONTROL_ECC_SEL_MASK 0x0000000000200000UL +#define UV4H_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL +#define UV4H_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL +#define UV4H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL +#define UV4H_GR0_TLB_MMR_CONTROL_PAGE_SIZE_MASK 0xf800000000000000UL + +#define UVH_GR0_TLB_MMR_CONTROL_INDEX_MASK ( \ + is_uv1_hub() ? UV1H_GR0_TLB_MMR_CONTROL_INDEX_MASK : \ + is_uv2_hub() ? UV2H_GR0_TLB_MMR_CONTROL_INDEX_MASK : \ + is_uv3_hub() ? UV3H_GR0_TLB_MMR_CONTROL_INDEX_MASK : \ + /*is_uv4_hub*/ UV4H_GR0_TLB_MMR_CONTROL_INDEX_MASK) +#define UVH_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK ( \ + is_uv1_hub() ? UV1H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK : \ + is_uv2_hub() ? UV2H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK : \ + is_uv3_hub() ? UV3H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK : \ + /*is_uv4_hub*/ UV4H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK) +#define UVH_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT ( \ + is_uv1_hub() ? UV1H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT : \ + is_uv2_hub() ? UV2H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT : \ + is_uv3_hub() ? UV3H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT : \ + /*is_uv4_hub*/ UV4H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT) + union uvh_gr0_tlb_mmr_control_u { unsigned long v; struct uvh_gr0_tlb_mmr_control_s { - unsigned long index:12; /* RW */ - unsigned long mem_sel:2; /* RW */ - unsigned long rsvd_14_15:2; + unsigned long rsvd_0_15:16; unsigned long auto_valid_en:1; /* RW */ unsigned long rsvd_17_19:3; unsigned long mmr_hash_index_en:1; /* RW */ @@ -690,9 +985,7 @@ union uvh_gr0_tlb_mmr_control_u { unsigned long rsvd_61_63:3; } s1; struct uvxh_gr0_tlb_mmr_control_s { - unsigned long index:12; /* RW */ - unsigned long mem_sel:2; /* RW */ - unsigned long rsvd_14_15:2; + unsigned long rsvd_0_15:16; unsigned long auto_valid_en:1; /* RW */ unsigned long rsvd_17_19:3; unsigned long mmr_hash_index_en:1; /* RW */ @@ -703,8 +996,7 @@ union uvh_gr0_tlb_mmr_control_u { unsigned long rsvd_33_47:15; unsigned long rsvd_48:1; unsigned long rsvd_49_51:3; - unsigned long rsvd_52:1; - unsigned long rsvd_53_63:11; + unsigned long rsvd_52_63:12; } sx; struct uv2h_gr0_tlb_mmr_control_s { unsigned long index:12; /* RW */ @@ -741,6 +1033,24 @@ union uvh_gr0_tlb_mmr_control_u { unsigned long undef_52:1; /* Undefined */ unsigned long rsvd_53_63:11; } s3; + struct uv4h_gr0_tlb_mmr_control_s { + unsigned long index:13; /* RW */ + unsigned long mem_sel:2; /* RW */ + unsigned long rsvd_15:1; + unsigned long auto_valid_en:1; /* RW */ + unsigned long rsvd_17_19:3; + unsigned long mmr_hash_index_en:1; /* RW */ + unsigned long ecc_sel:1; /* RW */ + unsigned long rsvd_22_29:8; + unsigned long mmr_write:1; /* WP */ + unsigned long mmr_read:1; /* WP */ + unsigned long mmr_op_done:1; /* RW */ + unsigned long rsvd_33_47:15; + unsigned long undef_48:1; /* Undefined */ + unsigned long rsvd_49_51:3; + unsigned long rsvd_52_58:7; + unsigned long page_size:5; /* RW */ + } s4; }; /* ========================================================================= */ @@ -749,19 +1059,14 @@ union uvh_gr0_tlb_mmr_control_u { #define UV1H_GR0_TLB_MMR_READ_DATA_HI 0x4010a0UL #define UV2H_GR0_TLB_MMR_READ_DATA_HI 0xc010a0UL #define UV3H_GR0_TLB_MMR_READ_DATA_HI 0xc010a0UL 
-#define UVH_GR0_TLB_MMR_READ_DATA_HI \ - (is_uv1_hub() ? UV1H_GR0_TLB_MMR_READ_DATA_HI : \ - (is_uv2_hub() ? UV2H_GR0_TLB_MMR_READ_DATA_HI : \ - UV3H_GR0_TLB_MMR_READ_DATA_HI)) +#define UV4H_GR0_TLB_MMR_READ_DATA_HI 0x6010a0UL +#define UVH_GR0_TLB_MMR_READ_DATA_HI ( \ + is_uv1_hub() ? UV1H_GR0_TLB_MMR_READ_DATA_HI : \ + is_uv2_hub() ? UV2H_GR0_TLB_MMR_READ_DATA_HI : \ + is_uv3_hub() ? UV3H_GR0_TLB_MMR_READ_DATA_HI : \ + /*is_uv4_hub*/ UV4H_GR0_TLB_MMR_READ_DATA_HI) #define UVH_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 -#define UVH_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 -#define UVH_GR0_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43 -#define UVH_GR0_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44 -#define UVH_GR0_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL -#define UVH_GR0_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL -#define UVH_GR0_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL -#define UVH_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL #define UV1H_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 #define UV1H_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 @@ -773,13 +1078,6 @@ union uvh_gr0_tlb_mmr_control_u { #define UV1H_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL #define UVXH_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 -#define UVXH_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 -#define UVXH_GR0_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43 -#define UVXH_GR0_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44 -#define UVXH_GR0_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL -#define UVXH_GR0_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL -#define UVXH_GR0_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL -#define UVXH_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL #define UV2H_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 #define UV2H_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 @@ -803,15 +1101,24 @@ union uvh_gr0_tlb_mmr_control_u { #define UV3H_GR0_TLB_MMR_READ_DATA_HI_AA_EXT_MASK 0x0000200000000000UL #define UV3H_GR0_TLB_MMR_READ_DATA_HI_WAY_ECC_MASK 0xff80000000000000UL +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_PNID_SHFT 34 +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 49 +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 51 +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_LARGER_SHFT 52 +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_AA_EXT_SHFT 53 +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_WAY_ECC_SHFT 55 +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_PFN_MASK 0x00000003ffffffffUL +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_PNID_MASK 0x0001fffc00000000UL +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0006000000000000UL +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0008000000000000UL +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0010000000000000UL +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_AA_EXT_MASK 0x0020000000000000UL +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_WAY_ECC_MASK 0xff80000000000000UL + + union uvh_gr0_tlb_mmr_read_data_hi_u { unsigned long v; - struct uvh_gr0_tlb_mmr_read_data_hi_s { - unsigned long pfn:41; /* RO */ - unsigned long gaa:2; /* RO */ - unsigned long dirty:1; /* RO */ - unsigned long larger:1; /* RO */ - unsigned long rsvd_45_63:19; - } s; struct uv1h_gr0_tlb_mmr_read_data_hi_s { unsigned long pfn:41; /* RO */ unsigned long gaa:2; /* RO */ @@ -819,13 +1126,6 @@ union uvh_gr0_tlb_mmr_read_data_hi_u { unsigned long larger:1; /* RO */ unsigned long rsvd_45_63:19; } s1; - struct uvxh_gr0_tlb_mmr_read_data_hi_s { - unsigned long pfn:41; /* RO */ - unsigned long gaa:2; /* RO */ - unsigned long dirty:1; /* RO */ - unsigned long larger:1; /* RO */ - unsigned long rsvd_45_63:19; - } sx; 
struct uv2h_gr0_tlb_mmr_read_data_hi_s { unsigned long pfn:41; /* RO */ unsigned long gaa:2; /* RO */ @@ -842,6 +1142,16 @@ union uvh_gr0_tlb_mmr_read_data_hi_u { unsigned long undef_46_54:9; /* Undefined */ unsigned long way_ecc:9; /* RO */ } s3; + struct uv4h_gr0_tlb_mmr_read_data_hi_s { + unsigned long pfn:34; /* RO */ + unsigned long pnid:15; /* RO */ + unsigned long gaa:2; /* RO */ + unsigned long dirty:1; /* RO */ + unsigned long larger:1; /* RO */ + unsigned long aa_ext:1; /* RO */ + unsigned long undef_54:1; /* Undefined */ + unsigned long way_ecc:9; /* RO */ + } s4; }; /* ========================================================================= */ @@ -850,10 +1160,12 @@ union uvh_gr0_tlb_mmr_read_data_hi_u { #define UV1H_GR0_TLB_MMR_READ_DATA_LO 0x4010a8UL #define UV2H_GR0_TLB_MMR_READ_DATA_LO 0xc010a8UL #define UV3H_GR0_TLB_MMR_READ_DATA_LO 0xc010a8UL -#define UVH_GR0_TLB_MMR_READ_DATA_LO \ - (is_uv1_hub() ? UV1H_GR0_TLB_MMR_READ_DATA_LO : \ - (is_uv2_hub() ? UV2H_GR0_TLB_MMR_READ_DATA_LO : \ - UV3H_GR0_TLB_MMR_READ_DATA_LO)) +#define UV4H_GR0_TLB_MMR_READ_DATA_LO 0x6010a8UL +#define UVH_GR0_TLB_MMR_READ_DATA_LO ( \ + is_uv1_hub() ? UV1H_GR0_TLB_MMR_READ_DATA_LO : \ + is_uv2_hub() ? UV2H_GR0_TLB_MMR_READ_DATA_LO : \ + is_uv3_hub() ? UV3H_GR0_TLB_MMR_READ_DATA_LO : \ + /*is_uv4_hub*/ UV4H_GR0_TLB_MMR_READ_DATA_LO) #define UVH_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 #define UVH_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 @@ -890,6 +1202,14 @@ union uvh_gr0_tlb_mmr_read_data_hi_u { #define UV3H_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL #define UV3H_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL +#define UV4H_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 +#define UV4H_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 +#define UV4H_GR0_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 +#define UV4H_GR0_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL +#define UV4H_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL +#define UV4H_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL + + union uvh_gr0_tlb_mmr_read_data_lo_u { unsigned long v; struct uvh_gr0_tlb_mmr_read_data_lo_s { @@ -917,12 +1237,25 @@ union uvh_gr0_tlb_mmr_read_data_lo_u { unsigned long asid:24; /* RO */ unsigned long valid:1; /* RO */ } s3; + struct uv4h_gr0_tlb_mmr_read_data_lo_s { + unsigned long vpn:39; /* RO */ + unsigned long asid:24; /* RO */ + unsigned long valid:1; /* RO */ + } s4; }; /* ========================================================================= */ /* UVH_GR1_TLB_INT0_CONFIG */ /* ========================================================================= */ -#define UVH_GR1_TLB_INT0_CONFIG 0x61f00UL +#define UV1H_GR1_TLB_INT0_CONFIG 0x61f00UL +#define UV2H_GR1_TLB_INT0_CONFIG 0x61f00UL +#define UV3H_GR1_TLB_INT0_CONFIG 0x61f00UL +#define UV4H_GR1_TLB_INT0_CONFIG 0x62100UL +#define UVH_GR1_TLB_INT0_CONFIG ( \ + is_uv1_hub() ? UV1H_GR1_TLB_INT0_CONFIG : \ + is_uv2_hub() ? UV2H_GR1_TLB_INT0_CONFIG : \ + is_uv3_hub() ? 
UV3H_GR1_TLB_INT0_CONFIG : \ + /*is_uv4_hub*/ UV4H_GR1_TLB_INT0_CONFIG) #define UVH_GR1_TLB_INT0_CONFIG_VECTOR_SHFT 0 #define UVH_GR1_TLB_INT0_CONFIG_DM_SHFT 8 @@ -941,6 +1274,7 @@ union uvh_gr0_tlb_mmr_read_data_lo_u { #define UVH_GR1_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL #define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL + union uvh_gr1_tlb_int0_config_u { unsigned long v; struct uvh_gr1_tlb_int0_config_s { @@ -960,7 +1294,15 @@ union uvh_gr1_tlb_int0_config_u { /* ========================================================================= */ /* UVH_GR1_TLB_INT1_CONFIG */ /* ========================================================================= */ -#define UVH_GR1_TLB_INT1_CONFIG 0x61f40UL +#define UV1H_GR1_TLB_INT1_CONFIG 0x61f40UL +#define UV2H_GR1_TLB_INT1_CONFIG 0x61f40UL +#define UV3H_GR1_TLB_INT1_CONFIG 0x61f40UL +#define UV4H_GR1_TLB_INT1_CONFIG 0x62140UL +#define UVH_GR1_TLB_INT1_CONFIG ( \ + is_uv1_hub() ? UV1H_GR1_TLB_INT1_CONFIG : \ + is_uv2_hub() ? UV2H_GR1_TLB_INT1_CONFIG : \ + is_uv3_hub() ? UV3H_GR1_TLB_INT1_CONFIG : \ + /*is_uv4_hub*/ UV4H_GR1_TLB_INT1_CONFIG) #define UVH_GR1_TLB_INT1_CONFIG_VECTOR_SHFT 0 #define UVH_GR1_TLB_INT1_CONFIG_DM_SHFT 8 @@ -979,6 +1321,7 @@ union uvh_gr1_tlb_int0_config_u { #define UVH_GR1_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL #define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL + union uvh_gr1_tlb_int1_config_u { unsigned long v; struct uvh_gr1_tlb_int1_config_s { @@ -1001,19 +1344,18 @@ union uvh_gr1_tlb_int1_config_u { #define UV1H_GR1_TLB_MMR_CONTROL 0x801080UL #define UV2H_GR1_TLB_MMR_CONTROL 0x1001080UL #define UV3H_GR1_TLB_MMR_CONTROL 0x1001080UL -#define UVH_GR1_TLB_MMR_CONTROL \ - (is_uv1_hub() ? UV1H_GR1_TLB_MMR_CONTROL : \ - (is_uv2_hub() ? UV2H_GR1_TLB_MMR_CONTROL : \ - UV3H_GR1_TLB_MMR_CONTROL)) +#define UV4H_GR1_TLB_MMR_CONTROL 0x701080UL +#define UVH_GR1_TLB_MMR_CONTROL ( \ + is_uv1_hub() ? UV1H_GR1_TLB_MMR_CONTROL : \ + is_uv2_hub() ? UV2H_GR1_TLB_MMR_CONTROL : \ + is_uv3_hub() ? 
UV3H_GR1_TLB_MMR_CONTROL : \ + /*is_uv4_hub*/ UV4H_GR1_TLB_MMR_CONTROL) #define UVH_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0 -#define UVH_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 #define UVH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 #define UVH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 #define UVH_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 #define UVH_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31 -#define UVH_GR1_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL -#define UVH_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL #define UVH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL #define UVH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL #define UVH_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL @@ -1043,14 +1385,11 @@ union uvh_gr1_tlb_int1_config_u { #define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_MASK 0x1000000000000000UL #define UVXH_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0 -#define UVXH_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 #define UVXH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 #define UVXH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 #define UVXH_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 #define UVXH_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31 #define UVXH_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32 -#define UVXH_GR1_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL -#define UVXH_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL #define UVXH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL #define UVXH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL #define UVXH_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL @@ -1093,12 +1432,30 @@ union uvh_gr1_tlb_int1_config_u { #define UV3H_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL #define UV3H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL +#define UV4H_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0 +#define UV4H_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 13 +#define UV4H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 +#define UV4H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 +#define UV4H_GR1_TLB_MMR_CONTROL_ECC_SEL_SHFT 21 +#define UV4H_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 +#define UV4H_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31 +#define UV4H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32 +#define UV4H_GR1_TLB_MMR_CONTROL_PAGE_SIZE_SHFT 59 +#define UV4H_GR1_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000001fffUL +#define UV4H_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000006000UL +#define UV4H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL +#define UV4H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL +#define UV4H_GR1_TLB_MMR_CONTROL_ECC_SEL_MASK 0x0000000000200000UL +#define UV4H_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL +#define UV4H_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL +#define UV4H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL +#define UV4H_GR1_TLB_MMR_CONTROL_PAGE_SIZE_MASK 0xf800000000000000UL + + union uvh_gr1_tlb_mmr_control_u { unsigned long v; struct uvh_gr1_tlb_mmr_control_s { - unsigned long index:12; /* RW */ - unsigned long mem_sel:2; /* RW */ - unsigned long rsvd_14_15:2; + unsigned long rsvd_0_15:16; unsigned long auto_valid_en:1; /* RW */ unsigned long rsvd_17_19:3; unsigned long mmr_hash_index_en:1; /* RW */ @@ -1132,9 +1489,7 @@ union uvh_gr1_tlb_mmr_control_u { unsigned long rsvd_61_63:3; } s1; struct uvxh_gr1_tlb_mmr_control_s { - unsigned long index:12; /* RW */ - unsigned long mem_sel:2; /* RW */ - unsigned long rsvd_14_15:2; + unsigned long rsvd_0_15:16; unsigned long auto_valid_en:1; /* RW */ unsigned long rsvd_17_19:3; unsigned long 
mmr_hash_index_en:1; /* RW */ @@ -1145,8 +1500,7 @@ union uvh_gr1_tlb_mmr_control_u { unsigned long rsvd_33_47:15; unsigned long rsvd_48:1; unsigned long rsvd_49_51:3; - unsigned long rsvd_52:1; - unsigned long rsvd_53_63:11; + unsigned long rsvd_52_63:12; } sx; struct uv2h_gr1_tlb_mmr_control_s { unsigned long index:12; /* RW */ @@ -1183,6 +1537,24 @@ union uvh_gr1_tlb_mmr_control_u { unsigned long undef_52:1; /* Undefined */ unsigned long rsvd_53_63:11; } s3; + struct uv4h_gr1_tlb_mmr_control_s { + unsigned long index:13; /* RW */ + unsigned long mem_sel:2; /* RW */ + unsigned long rsvd_15:1; + unsigned long auto_valid_en:1; /* RW */ + unsigned long rsvd_17_19:3; + unsigned long mmr_hash_index_en:1; /* RW */ + unsigned long ecc_sel:1; /* RW */ + unsigned long rsvd_22_29:8; + unsigned long mmr_write:1; /* WP */ + unsigned long mmr_read:1; /* WP */ + unsigned long mmr_op_done:1; /* RW */ + unsigned long rsvd_33_47:15; + unsigned long undef_48:1; /* Undefined */ + unsigned long rsvd_49_51:3; + unsigned long rsvd_52_58:7; + unsigned long page_size:5; /* RW */ + } s4; }; /* ========================================================================= */ @@ -1191,19 +1563,14 @@ union uvh_gr1_tlb_mmr_control_u { #define UV1H_GR1_TLB_MMR_READ_DATA_HI 0x8010a0UL #define UV2H_GR1_TLB_MMR_READ_DATA_HI 0x10010a0UL #define UV3H_GR1_TLB_MMR_READ_DATA_HI 0x10010a0UL -#define UVH_GR1_TLB_MMR_READ_DATA_HI \ - (is_uv1_hub() ? UV1H_GR1_TLB_MMR_READ_DATA_HI : \ - (is_uv2_hub() ? UV2H_GR1_TLB_MMR_READ_DATA_HI : \ - UV3H_GR1_TLB_MMR_READ_DATA_HI)) +#define UV4H_GR1_TLB_MMR_READ_DATA_HI 0x7010a0UL +#define UVH_GR1_TLB_MMR_READ_DATA_HI ( \ + is_uv1_hub() ? UV1H_GR1_TLB_MMR_READ_DATA_HI : \ + is_uv2_hub() ? UV2H_GR1_TLB_MMR_READ_DATA_HI : \ + is_uv3_hub() ? UV3H_GR1_TLB_MMR_READ_DATA_HI : \ + /*is_uv4_hub*/ UV4H_GR1_TLB_MMR_READ_DATA_HI) #define UVH_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 -#define UVH_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 -#define UVH_GR1_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43 -#define UVH_GR1_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44 -#define UVH_GR1_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL -#define UVH_GR1_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL -#define UVH_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL -#define UVH_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL #define UV1H_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 #define UV1H_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 @@ -1215,13 +1582,6 @@ union uvh_gr1_tlb_mmr_control_u { #define UV1H_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL #define UVXH_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 -#define UVXH_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 -#define UVXH_GR1_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43 -#define UVXH_GR1_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44 -#define UVXH_GR1_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL -#define UVXH_GR1_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL -#define UVXH_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL -#define UVXH_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL #define UV2H_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 #define UV2H_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 @@ -1245,15 +1605,24 @@ union uvh_gr1_tlb_mmr_control_u { #define UV3H_GR1_TLB_MMR_READ_DATA_HI_AA_EXT_MASK 0x0000200000000000UL #define UV3H_GR1_TLB_MMR_READ_DATA_HI_WAY_ECC_MASK 0xff80000000000000UL +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_PNID_SHFT 34 +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 49 +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 51 +#define 
UV4H_GR1_TLB_MMR_READ_DATA_HI_LARGER_SHFT 52 +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_AA_EXT_SHFT 53 +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_WAY_ECC_SHFT 55 +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_PFN_MASK 0x00000003ffffffffUL +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_PNID_MASK 0x0001fffc00000000UL +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0006000000000000UL +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0008000000000000UL +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0010000000000000UL +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_AA_EXT_MASK 0x0020000000000000UL +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_WAY_ECC_MASK 0xff80000000000000UL + + union uvh_gr1_tlb_mmr_read_data_hi_u { unsigned long v; - struct uvh_gr1_tlb_mmr_read_data_hi_s { - unsigned long pfn:41; /* RO */ - unsigned long gaa:2; /* RO */ - unsigned long dirty:1; /* RO */ - unsigned long larger:1; /* RO */ - unsigned long rsvd_45_63:19; - } s; struct uv1h_gr1_tlb_mmr_read_data_hi_s { unsigned long pfn:41; /* RO */ unsigned long gaa:2; /* RO */ @@ -1261,13 +1630,6 @@ union uvh_gr1_tlb_mmr_read_data_hi_u { unsigned long larger:1; /* RO */ unsigned long rsvd_45_63:19; } s1; - struct uvxh_gr1_tlb_mmr_read_data_hi_s { - unsigned long pfn:41; /* RO */ - unsigned long gaa:2; /* RO */ - unsigned long dirty:1; /* RO */ - unsigned long larger:1; /* RO */ - unsigned long rsvd_45_63:19; - } sx; struct uv2h_gr1_tlb_mmr_read_data_hi_s { unsigned long pfn:41; /* RO */ unsigned long gaa:2; /* RO */ @@ -1284,6 +1646,16 @@ union uvh_gr1_tlb_mmr_read_data_hi_u { unsigned long undef_46_54:9; /* Undefined */ unsigned long way_ecc:9; /* RO */ } s3; + struct uv4h_gr1_tlb_mmr_read_data_hi_s { + unsigned long pfn:34; /* RO */ + unsigned long pnid:15; /* RO */ + unsigned long gaa:2; /* RO */ + unsigned long dirty:1; /* RO */ + unsigned long larger:1; /* RO */ + unsigned long aa_ext:1; /* RO */ + unsigned long undef_54:1; /* Undefined */ + unsigned long way_ecc:9; /* RO */ + } s4; }; /* ========================================================================= */ @@ -1292,10 +1664,12 @@ union uvh_gr1_tlb_mmr_read_data_hi_u { #define UV1H_GR1_TLB_MMR_READ_DATA_LO 0x8010a8UL #define UV2H_GR1_TLB_MMR_READ_DATA_LO 0x10010a8UL #define UV3H_GR1_TLB_MMR_READ_DATA_LO 0x10010a8UL -#define UVH_GR1_TLB_MMR_READ_DATA_LO \ - (is_uv1_hub() ? UV1H_GR1_TLB_MMR_READ_DATA_LO : \ - (is_uv2_hub() ? UV2H_GR1_TLB_MMR_READ_DATA_LO : \ - UV3H_GR1_TLB_MMR_READ_DATA_LO)) +#define UV4H_GR1_TLB_MMR_READ_DATA_LO 0x7010a8UL +#define UVH_GR1_TLB_MMR_READ_DATA_LO ( \ + is_uv1_hub() ? UV1H_GR1_TLB_MMR_READ_DATA_LO : \ + is_uv2_hub() ? UV2H_GR1_TLB_MMR_READ_DATA_LO : \ + is_uv3_hub() ? 
UV3H_GR1_TLB_MMR_READ_DATA_LO : \ + /*is_uv4_hub*/ UV4H_GR1_TLB_MMR_READ_DATA_LO) #define UVH_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 #define UVH_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 @@ -1332,6 +1706,14 @@ union uvh_gr1_tlb_mmr_read_data_hi_u { #define UV3H_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL #define UV3H_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL +#define UV4H_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 +#define UV4H_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 +#define UV4H_GR1_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 +#define UV4H_GR1_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL +#define UV4H_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL +#define UV4H_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL + + union uvh_gr1_tlb_mmr_read_data_lo_u { unsigned long v; struct uvh_gr1_tlb_mmr_read_data_lo_s { @@ -1359,6 +1741,11 @@ union uvh_gr1_tlb_mmr_read_data_lo_u { unsigned long asid:24; /* RO */ unsigned long valid:1; /* RO */ } s3; + struct uv4h_gr1_tlb_mmr_read_data_lo_s { + unsigned long vpn:39; /* RO */ + unsigned long asid:24; /* RO */ + unsigned long valid:1; /* RO */ + } s4; }; /* ========================================================================= */ @@ -1369,6 +1756,7 @@ union uvh_gr1_tlb_mmr_read_data_lo_u { #define UVH_INT_CMPB_REAL_TIME_CMPB_SHFT 0 #define UVH_INT_CMPB_REAL_TIME_CMPB_MASK 0x00ffffffffffffffUL + union uvh_int_cmpb_u { unsigned long v; struct uvh_int_cmpb_s { @@ -1382,12 +1770,14 @@ union uvh_int_cmpb_u { /* ========================================================================= */ #define UVH_INT_CMPC 0x22100UL + #define UV1H_INT_CMPC_REAL_TIME_CMPC_SHFT 0 #define UV1H_INT_CMPC_REAL_TIME_CMPC_MASK 0x00ffffffffffffffUL #define UVXH_INT_CMPC_REAL_TIME_CMP_2_SHFT 0 #define UVXH_INT_CMPC_REAL_TIME_CMP_2_MASK 0x00ffffffffffffffUL + union uvh_int_cmpc_u { unsigned long v; struct uvh_int_cmpc_s { @@ -1401,12 +1791,14 @@ union uvh_int_cmpc_u { /* ========================================================================= */ #define UVH_INT_CMPD 0x22180UL + #define UV1H_INT_CMPD_REAL_TIME_CMPD_SHFT 0 #define UV1H_INT_CMPD_REAL_TIME_CMPD_MASK 0x00ffffffffffffffUL #define UVXH_INT_CMPD_REAL_TIME_CMP_3_SHFT 0 #define UVXH_INT_CMPD_REAL_TIME_CMP_3_MASK 0x00ffffffffffffffUL + union uvh_int_cmpd_u { unsigned long v; struct uvh_int_cmpd_s { @@ -1419,7 +1811,16 @@ union uvh_int_cmpd_u { /* UVH_IPI_INT */ /* ========================================================================= */ #define UVH_IPI_INT 0x60500UL -#define UVH_IPI_INT_32 0x348 + +#define UV1H_IPI_INT_32 0x348 +#define UV2H_IPI_INT_32 0x348 +#define UV3H_IPI_INT_32 0x348 +#define UV4H_IPI_INT_32 0x268 +#define UVH_IPI_INT_32 ( \ + is_uv1_hub() ? UV1H_IPI_INT_32 : \ + is_uv2_hub() ? UV2H_IPI_INT_32 : \ + is_uv3_hub() ? 
UV3H_IPI_INT_32 : \ + /*is_uv4_hub*/ UV4H_IPI_INT_32) #define UVH_IPI_INT_VECTOR_SHFT 0 #define UVH_IPI_INT_DELIVERY_MODE_SHFT 8 @@ -1432,6 +1833,7 @@ union uvh_int_cmpd_u { #define UVH_IPI_INT_APIC_ID_MASK 0x0000ffffffff0000UL #define UVH_IPI_INT_SEND_MASK 0x8000000000000000UL + union uvh_ipi_int_u { unsigned long v; struct uvh_ipi_int_s { @@ -1448,103 +1850,269 @@ union uvh_ipi_int_u { /* ========================================================================= */ /* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST */ /* ========================================================================= */ -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL +#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL +#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL +#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL +#define UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST uv_undefined("UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST") +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST ( \ + is_uv1_hub() ? UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST : \ + is_uv2_hub() ? UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST : \ + is_uv3_hub() ? UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST : \ + /*is_uv4_hub*/ UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST) #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_32 0x9c0 -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4 -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_SHFT 49 -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_MASK 0x7ffe000000000000UL + +#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4 +#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_SHFT 49 +#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL +#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_MASK 0x7ffe000000000000UL + + +#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4 +#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_SHFT 49 +#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL +#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_MASK 0x7ffe000000000000UL + +#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4 +#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_SHFT 49 +#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL +#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_MASK 0x7ffe000000000000UL + union uvh_lb_bau_intd_payload_queue_first_u { unsigned long v; - struct uvh_lb_bau_intd_payload_queue_first_s { + struct uv1h_lb_bau_intd_payload_queue_first_s { unsigned long rsvd_0_3:4; unsigned long address:39; /* RW */ unsigned long rsvd_43_48:6; unsigned long node_id:14; /* RW */ unsigned long rsvd_63:1; - } s; + } s1; + struct uv2h_lb_bau_intd_payload_queue_first_s { + unsigned long rsvd_0_3:4; + unsigned long address:39; /* RW */ + unsigned long rsvd_43_48:6; + unsigned long node_id:14; /* RW */ + unsigned long rsvd_63:1; + } s2; + struct uv3h_lb_bau_intd_payload_queue_first_s { + unsigned long rsvd_0_3:4; + unsigned long address:39; /* RW */ + unsigned long rsvd_43_48:6; + unsigned long node_id:14; /* RW */ + unsigned long rsvd_63:1; + } s3; }; /* ========================================================================= */ /* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST */ /* ========================================================================= */ -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL +#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL +#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL +#define 
UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL +#define UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST uv_undefined("UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST") +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST ( \ + is_uv1_hub() ? UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST : \ + is_uv2_hub() ? UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST : \ + is_uv3_hub() ? UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST : \ + /*is_uv4_hub*/ UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST) #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_32 0x9c8 -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4 -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL + +#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4 +#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL + + +#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4 +#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL + +#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4 +#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL + union uvh_lb_bau_intd_payload_queue_last_u { unsigned long v; - struct uvh_lb_bau_intd_payload_queue_last_s { + struct uv1h_lb_bau_intd_payload_queue_last_s { unsigned long rsvd_0_3:4; unsigned long address:39; /* RW */ unsigned long rsvd_43_63:21; - } s; + } s1; + struct uv2h_lb_bau_intd_payload_queue_last_s { + unsigned long rsvd_0_3:4; + unsigned long address:39; /* RW */ + unsigned long rsvd_43_63:21; + } s2; + struct uv3h_lb_bau_intd_payload_queue_last_s { + unsigned long rsvd_0_3:4; + unsigned long address:39; /* RW */ + unsigned long rsvd_43_63:21; + } s3; }; /* ========================================================================= */ /* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL */ /* ========================================================================= */ -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL +#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL +#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL +#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL +#define UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL uv_undefined("UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL") +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL ( \ + is_uv1_hub() ? UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL : \ + is_uv2_hub() ? UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL : \ + is_uv3_hub() ? 
UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL : \ + /*is_uv4_hub*/ UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL) #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_32 0x9d0 -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4 -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL + +#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4 +#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL + + +#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4 +#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL + +#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4 +#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL + union uvh_lb_bau_intd_payload_queue_tail_u { unsigned long v; - struct uvh_lb_bau_intd_payload_queue_tail_s { + struct uv1h_lb_bau_intd_payload_queue_tail_s { unsigned long rsvd_0_3:4; unsigned long address:39; /* RW */ unsigned long rsvd_43_63:21; - } s; + } s1; + struct uv2h_lb_bau_intd_payload_queue_tail_s { + unsigned long rsvd_0_3:4; + unsigned long address:39; /* RW */ + unsigned long rsvd_43_63:21; + } s2; + struct uv3h_lb_bau_intd_payload_queue_tail_s { + unsigned long rsvd_0_3:4; + unsigned long address:39; /* RW */ + unsigned long rsvd_43_63:21; + } s3; }; /* ========================================================================= */ /* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE */ /* ========================================================================= */ -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL +#define UV4H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE uv_undefined("UV4H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE") +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE ( \ + is_uv1_hub() ? UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE : \ + is_uv2_hub() ? UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE : \ + is_uv3_hub() ? 
UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE : \ + /*is_uv4_hub*/ UV4H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE) #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0xa68 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_SHFT 1 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_SHFT 2 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_SHFT 3 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_SHFT 4 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_SHFT 5 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_SHFT 6 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_SHFT 7 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_SHFT 8 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_SHFT 9 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_SHFT 10 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_SHFT 11 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_SHFT 12 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_SHFT 13 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_SHFT 14 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_SHFT 15 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_MASK 0x0000000000000002UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_MASK 0x0000000000000004UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_MASK 0x0000000000000008UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_MASK 0x0000000000000010UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_MASK 0x0000000000000020UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_MASK 0x0000000000000040UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_MASK 0x0000000000000080UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_MASK 0x0000000000000100UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_MASK 0x0000000000000200UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_MASK 0x0000000000000400UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK 0x0000000000000800UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_MASK 0x0000000000001000UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_MASK 0x0000000000002000UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_MASK 0x0000000000008000UL + +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_SHFT 1 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_SHFT 2 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_SHFT 3 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_SHFT 4 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_SHFT 5 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_SHFT 6 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_SHFT 7 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_SHFT 8 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_SHFT 9 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_SHFT 10 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_SHFT 11 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_SHFT 12 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_SHFT 13 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_SHFT 14 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_SHFT 15 +#define 
UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_MASK 0x0000000000000002UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_MASK 0x0000000000000004UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_MASK 0x0000000000000008UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_MASK 0x0000000000000010UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_MASK 0x0000000000000020UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_MASK 0x0000000000000040UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_MASK 0x0000000000000080UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_MASK 0x0000000000000100UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_MASK 0x0000000000000200UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_MASK 0x0000000000000400UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK 0x0000000000000800UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_MASK 0x0000000000001000UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_MASK 0x0000000000002000UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_MASK 0x0000000000008000UL + + +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_SHFT 1 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_SHFT 2 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_SHFT 3 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_SHFT 4 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_SHFT 5 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_SHFT 6 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_SHFT 7 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_SHFT 8 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_SHFT 9 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_SHFT 10 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_SHFT 11 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_SHFT 12 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_SHFT 13 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_SHFT 14 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_SHFT 15 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_MASK 0x0000000000000002UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_MASK 0x0000000000000004UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_MASK 0x0000000000000008UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_MASK 0x0000000000000010UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_MASK 0x0000000000000020UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_MASK 0x0000000000000040UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_MASK 0x0000000000000080UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_MASK 0x0000000000000100UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_MASK 0x0000000000000200UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_MASK 0x0000000000000400UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK 0x0000000000000800UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_MASK 0x0000000000001000UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_MASK 0x0000000000002000UL +#define 
UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_MASK 0x0000000000008000UL + +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_SHFT 1 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_SHFT 2 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_SHFT 3 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_SHFT 4 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_SHFT 5 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_SHFT 6 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_SHFT 7 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_SHFT 8 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_SHFT 9 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_SHFT 10 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_SHFT 11 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_SHFT 12 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_SHFT 13 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_SHFT 14 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_SHFT 15 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_MASK 0x0000000000000002UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_MASK 0x0000000000000004UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_MASK 0x0000000000000008UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_MASK 0x0000000000000010UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_MASK 0x0000000000000020UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_MASK 0x0000000000000040UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_MASK 0x0000000000000080UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_MASK 0x0000000000000100UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_MASK 0x0000000000000200UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_MASK 0x0000000000000400UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK 0x0000000000000800UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_MASK 0x0000000000001000UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_MASK 0x0000000000002000UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_MASK 0x0000000000008000UL + union uvh_lb_bau_intd_software_acknowledge_u { unsigned long v; - struct uvh_lb_bau_intd_software_acknowledge_s { + struct uv1h_lb_bau_intd_software_acknowledge_s { unsigned long pending_0:1; /* RW, W1C */ unsigned long pending_1:1; /* RW, W1C */ unsigned long pending_2:1; /* RW, W1C */ @@ -1562,27 +2130,84 @@ union uvh_lb_bau_intd_software_acknowledge_u { unsigned long timeout_6:1; /* RW, W1C */ unsigned long timeout_7:1; /* RW, W1C */ unsigned long rsvd_16_63:48; - } s; + } s1; + struct uv2h_lb_bau_intd_software_acknowledge_s { + unsigned long pending_0:1; /* RW */ + unsigned long pending_1:1; /* RW */ + unsigned long pending_2:1; /* RW */ + unsigned long pending_3:1; /* RW */ + unsigned long pending_4:1; /* RW */ + unsigned long pending_5:1; /* RW */ + unsigned long pending_6:1; /* RW */ + unsigned long pending_7:1; /* RW */ + unsigned long timeout_0:1; /* RW */ + unsigned long timeout_1:1; /* RW */ + unsigned long timeout_2:1; /* RW */ + unsigned long timeout_3:1; /* RW */ + unsigned long timeout_4:1; /* RW */ + 
unsigned long timeout_5:1; /* RW */ + unsigned long timeout_6:1; /* RW */ + unsigned long timeout_7:1; /* RW */ + unsigned long rsvd_16_63:48; + } s2; + struct uv3h_lb_bau_intd_software_acknowledge_s { + unsigned long pending_0:1; /* RW */ + unsigned long pending_1:1; /* RW */ + unsigned long pending_2:1; /* RW */ + unsigned long pending_3:1; /* RW */ + unsigned long pending_4:1; /* RW */ + unsigned long pending_5:1; /* RW */ + unsigned long pending_6:1; /* RW */ + unsigned long pending_7:1; /* RW */ + unsigned long timeout_0:1; /* RW */ + unsigned long timeout_1:1; /* RW */ + unsigned long timeout_2:1; /* RW */ + unsigned long timeout_3:1; /* RW */ + unsigned long timeout_4:1; /* RW */ + unsigned long timeout_5:1; /* RW */ + unsigned long timeout_6:1; /* RW */ + unsigned long timeout_7:1; /* RW */ + unsigned long rsvd_16_63:48; + } s3; }; /* ========================================================================= */ /* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS */ /* ========================================================================= */ -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x320088UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x320088UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x320088UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x320088UL +#define UV4H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS uv_undefined("UV4H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS") +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS ( \ + is_uv1_hub() ? UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS : \ + is_uv2_hub() ? UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS : \ + is_uv3_hub() ? UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS : \ + /*is_uv4_hub*/ UV4H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS) #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0xa70 /* ========================================================================= */ /* UVH_LB_BAU_MISC_CONTROL */ /* ========================================================================= */ -#define UVH_LB_BAU_MISC_CONTROL 0x320170UL #define UV1H_LB_BAU_MISC_CONTROL 0x320170UL #define UV2H_LB_BAU_MISC_CONTROL 0x320170UL #define UV3H_LB_BAU_MISC_CONTROL 0x320170UL -#define UVH_LB_BAU_MISC_CONTROL_32 0xa10 -#define UV1H_LB_BAU_MISC_CONTROL_32 0x320170UL -#define UV2H_LB_BAU_MISC_CONTROL_32 0x320170UL -#define UV3H_LB_BAU_MISC_CONTROL_32 0x320170UL +#define UV4H_LB_BAU_MISC_CONTROL 0xc8170UL +#define UVH_LB_BAU_MISC_CONTROL ( \ + is_uv1_hub() ? UV1H_LB_BAU_MISC_CONTROL : \ + is_uv2_hub() ? UV2H_LB_BAU_MISC_CONTROL : \ + is_uv3_hub() ? UV3H_LB_BAU_MISC_CONTROL : \ + /*is_uv4_hub*/ UV4H_LB_BAU_MISC_CONTROL) + +#define UV1H_LB_BAU_MISC_CONTROL_32 0xa10 +#define UV2H_LB_BAU_MISC_CONTROL_32 0xa10 +#define UV3H_LB_BAU_MISC_CONTROL_32 0xa10 +#define UV4H_LB_BAU_MISC_CONTROL_32 0xa18 +#define UVH_LB_BAU_MISC_CONTROL_32 ( \ + is_uv1_hub() ? UV1H_LB_BAU_MISC_CONTROL_32 : \ + is_uv2_hub() ? UV2H_LB_BAU_MISC_CONTROL_32 : \ + is_uv3_hub() ? 
UV3H_LB_BAU_MISC_CONTROL_32 : \ + /*is_uv4_hub*/ UV4H_LB_BAU_MISC_CONTROL_32) #define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0 #define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8 @@ -1590,8 +2215,6 @@ union uvh_lb_bau_intd_software_acknowledge_u { #define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10 #define UVH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11 #define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14 -#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15 -#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16 #define UVH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20 #define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21 #define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22 @@ -1606,8 +2229,6 @@ union uvh_lb_bau_intd_software_acknowledge_u { #define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL #define UVH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL #define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL -#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL -#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL #define UVH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL #define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL #define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL @@ -1656,8 +2277,6 @@ union uvh_lb_bau_intd_software_acknowledge_u { #define UVXH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10 #define UVXH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11 #define UVXH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14 -#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15 -#define UVXH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16 #define UVXH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20 #define UVXH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21 #define UVXH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22 @@ -1679,8 +2298,6 @@ union uvh_lb_bau_intd_software_acknowledge_u { #define UVXH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL #define UVXH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL #define UVXH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL -#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL -#define UVXH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL #define UVXH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL #define UVXH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL #define UVXH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL @@ -1797,6 +2414,88 @@ union uvh_lb_bau_intd_software_acknowledge_u { #define UV3H_LB_BAU_MISC_CONTROL_THREAD_KILL_TIMEBASE_MASK 0x00003fc000000000UL #define UV3H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL +#define UV4H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0 +#define UV4H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8 +#define UV4H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9 +#define UV4H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10 +#define UV4H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11 +#define UV4H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14 +#define UV4H_LB_BAU_MISC_CONTROL_RESERVED_15_19_SHFT 15 +#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20 +#define 
UV4H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21 +#define UV4H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22 +#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23 +#define UV4H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24 +#define UV4H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27 +#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28 +#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_SHFT 29 +#define UV4H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_SHFT 30 +#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_SHFT 31 +#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_SHFT 32 +#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT 33 +#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_SHFT 34 +#define UV4H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT 35 +#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_QUIESCE_MSGS_TO_QPI_SHFT 36 +#define UV4H_LB_BAU_MISC_CONTROL_RESERVED_37_SHFT 37 +#define UV4H_LB_BAU_MISC_CONTROL_THREAD_KILL_TIMEBASE_SHFT 38 +#define UV4H_LB_BAU_MISC_CONTROL_ADDRESS_INTERLEAVE_SELECT_SHFT 46 +#define UV4H_LB_BAU_MISC_CONTROL_FUN_SHFT 48 +#define UV4H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL +#define UV4H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL +#define UV4H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL +#define UV4H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL +#define UV4H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL +#define UV4H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL +#define UV4H_LB_BAU_MISC_CONTROL_RESERVED_15_19_MASK 0x00000000000f8000UL +#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL +#define UV4H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL +#define UV4H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL +#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL +#define UV4H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL +#define UV4H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL +#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL +#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_MASK 0x0000000020000000UL +#define UV4H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_MASK 0x0000000040000000UL +#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_MASK 0x0000000080000000UL +#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_MASK 0x0000000100000000UL +#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_MASK 0x0000000200000000UL +#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_MASK 0x0000000400000000UL +#define UV4H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_MASK 0x0000000800000000UL +#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_QUIESCE_MSGS_TO_QPI_MASK 0x0000001000000000UL +#define UV4H_LB_BAU_MISC_CONTROL_RESERVED_37_MASK 0x0000002000000000UL +#define UV4H_LB_BAU_MISC_CONTROL_THREAD_KILL_TIMEBASE_MASK 0x00003fc000000000UL +#define UV4H_LB_BAU_MISC_CONTROL_ADDRESS_INTERLEAVE_SELECT_MASK 0x0000400000000000UL +#define UV4H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL + +#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK \ + uv_undefined("UV4H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK") +#define 
UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK ( \ + is_uv1_hub() ? UV1H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK : \ + is_uv2_hub() ? UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK : \ + is_uv3_hub() ? UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK : \ + /*is_uv4_hub*/ UV4H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK) +#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT \ + uv_undefined("UV4H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT") +#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT ( \ + is_uv1_hub() ? UV1H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT : \ + is_uv2_hub() ? UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT : \ + is_uv3_hub() ? UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT : \ + /*is_uv4_hub*/ UV4H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT) +#define UV4H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK \ + uv_undefined("UV4H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK") +#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK ( \ + is_uv1_hub() ? UV1H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK : \ + is_uv2_hub() ? UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK : \ + is_uv3_hub() ? UV3H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK : \ + /*is_uv4_hub*/ UV4H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK) +#define UV4H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT \ + uv_undefined("UV4H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT") +#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT ( \ + is_uv1_hub() ? UV1H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT : \ + is_uv2_hub() ? UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT : \ + is_uv3_hub() ? 
UV3H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT : \ + /*is_uv4_hub*/ UV4H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT) + union uvh_lb_bau_misc_control_u { unsigned long v; struct uvh_lb_bau_misc_control_s { @@ -1806,8 +2505,7 @@ union uvh_lb_bau_misc_control_u { unsigned long force_lock_nop:1; /* RW */ unsigned long qpi_agent_presence_vector:3; /* RW */ unsigned long descriptor_fetch_mode:1; /* RW */ - unsigned long enable_intd_soft_ack_mode:1; /* RW */ - unsigned long intd_soft_ack_timeout_period:4; /* RW */ + unsigned long rsvd_15_19:5; unsigned long enable_dual_mapping_mode:1; /* RW */ unsigned long vga_io_port_decode_enable:1; /* RW */ unsigned long vga_io_port_16_bit_decode:1; /* RW */ @@ -1844,8 +2542,7 @@ union uvh_lb_bau_misc_control_u { unsigned long force_lock_nop:1; /* RW */ unsigned long qpi_agent_presence_vector:3; /* RW */ unsigned long descriptor_fetch_mode:1; /* RW */ - unsigned long enable_intd_soft_ack_mode:1; /* RW */ - unsigned long intd_soft_ack_timeout_period:4; /* RW */ + unsigned long rsvd_15_19:5; unsigned long enable_dual_mapping_mode:1; /* RW */ unsigned long vga_io_port_decode_enable:1; /* RW */ unsigned long vga_io_port_16_bit_decode:1; /* RW */ @@ -1918,13 +2615,59 @@ union uvh_lb_bau_misc_control_u { unsigned long rsvd_46_47:2; unsigned long fun:16; /* RW */ } s3; + struct uv4h_lb_bau_misc_control_s { + unsigned long rejection_delay:8; /* RW */ + unsigned long apic_mode:1; /* RW */ + unsigned long force_broadcast:1; /* RW */ + unsigned long force_lock_nop:1; /* RW */ + unsigned long qpi_agent_presence_vector:3; /* RW */ + unsigned long descriptor_fetch_mode:1; /* RW */ + unsigned long rsvd_15_19:5; + unsigned long enable_dual_mapping_mode:1; /* RW */ + unsigned long vga_io_port_decode_enable:1; /* RW */ + unsigned long vga_io_port_16_bit_decode:1; /* RW */ + unsigned long suppress_dest_registration:1; /* RW */ + unsigned long programmed_initial_priority:3; /* RW */ + unsigned long use_incoming_priority:1; /* RW */ + unsigned long enable_programmed_initial_priority:1;/* RW */ + unsigned long enable_automatic_apic_mode_selection:1;/* RW */ + unsigned long apic_mode_status:1; /* RO */ + unsigned long suppress_interrupts_to_self:1; /* RW */ + unsigned long enable_lock_based_system_flush:1;/* RW */ + unsigned long enable_extended_sb_status:1; /* RW */ + unsigned long suppress_int_prio_udt_to_self:1;/* RW */ + unsigned long use_legacy_descriptor_formats:1;/* RW */ + unsigned long suppress_quiesce_msgs_to_qpi:1; /* RW */ + unsigned long rsvd_37:1; + unsigned long thread_kill_timebase:8; /* RW */ + unsigned long address_interleave_select:1; /* RW */ + unsigned long rsvd_47:1; + unsigned long fun:16; /* RW */ + } s4; }; /* ========================================================================= */ /* UVH_LB_BAU_SB_ACTIVATION_CONTROL */ /* ========================================================================= */ -#define UVH_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL -#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9a8 +#define UV1H_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL +#define UV2H_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL +#define UV3H_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL +#define UV4H_LB_BAU_SB_ACTIVATION_CONTROL 0xc8020UL +#define UVH_LB_BAU_SB_ACTIVATION_CONTROL ( \ + is_uv1_hub() ? UV1H_LB_BAU_SB_ACTIVATION_CONTROL : \ + is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_CONTROL : \ + is_uv3_hub() ? 
UV3H_LB_BAU_SB_ACTIVATION_CONTROL : \ + /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_CONTROL) + +#define UV1H_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9a8 +#define UV2H_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9a8 +#define UV3H_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9a8 +#define UV4H_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9c8 +#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 ( \ + is_uv1_hub() ? UV1H_LB_BAU_SB_ACTIVATION_CONTROL_32 : \ + is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_CONTROL_32 : \ + is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_CONTROL_32 : \ + /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_CONTROL_32) #define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_SHFT 0 #define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT 62 @@ -1933,6 +2676,7 @@ union uvh_lb_bau_misc_control_u { #define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_MASK 0x4000000000000000UL #define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_MASK 0x8000000000000000UL + union uvh_lb_bau_sb_activation_control_u { unsigned long v; struct uvh_lb_bau_sb_activation_control_s { @@ -1946,12 +2690,30 @@ union uvh_lb_bau_sb_activation_control_u { /* ========================================================================= */ /* UVH_LB_BAU_SB_ACTIVATION_STATUS_0 */ /* ========================================================================= */ -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9b0 +#define UV1H_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL +#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL +#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL +#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_0 0xc8030UL +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0 ( \ + is_uv1_hub() ? UV1H_LB_BAU_SB_ACTIVATION_STATUS_0 : \ + is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_STATUS_0 : \ + is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_STATUS_0 : \ + /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_STATUS_0) + +#define UV1H_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9b0 +#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9b0 +#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9b0 +#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9d0 +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 ( \ + is_uv1_hub() ? UV1H_LB_BAU_SB_ACTIVATION_STATUS_0_32 : \ + is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_STATUS_0_32 : \ + is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_STATUS_0_32 : \ + /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_STATUS_0_32) #define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_SHFT 0 #define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_MASK 0xffffffffffffffffUL + union uvh_lb_bau_sb_activation_status_0_u { unsigned long v; struct uvh_lb_bau_sb_activation_status_0_s { @@ -1962,12 +2724,30 @@ union uvh_lb_bau_sb_activation_status_0_u { /* ========================================================================= */ /* UVH_LB_BAU_SB_ACTIVATION_STATUS_1 */ /* ========================================================================= */ -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9b8 +#define UV1H_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL +#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL +#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL +#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_1 0xc8040UL +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1 ( \ + is_uv1_hub() ? UV1H_LB_BAU_SB_ACTIVATION_STATUS_1 : \ + is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_STATUS_1 : \ + is_uv3_hub() ? 
UV3H_LB_BAU_SB_ACTIVATION_STATUS_1 : \ + /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_STATUS_1) + +#define UV1H_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9b8 +#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9b8 +#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9b8 +#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9d8 +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 ( \ + is_uv1_hub() ? UV1H_LB_BAU_SB_ACTIVATION_STATUS_1_32 : \ + is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_STATUS_1_32 : \ + is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_STATUS_1_32 : \ + /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_STATUS_1_32) #define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_SHFT 0 #define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_MASK 0xffffffffffffffffUL + union uvh_lb_bau_sb_activation_status_1_u { unsigned long v; struct uvh_lb_bau_sb_activation_status_1_s { @@ -1978,23 +2758,55 @@ union uvh_lb_bau_sb_activation_status_1_u { /* ========================================================================= */ /* UVH_LB_BAU_SB_DESCRIPTOR_BASE */ /* ========================================================================= */ -#define UVH_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL -#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9a0 +#define UV1H_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL +#define UV2H_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL +#define UV3H_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL +#define UV4H_LB_BAU_SB_DESCRIPTOR_BASE 0xc8010UL +#define UVH_LB_BAU_SB_DESCRIPTOR_BASE ( \ + is_uv1_hub() ? UV1H_LB_BAU_SB_DESCRIPTOR_BASE : \ + is_uv2_hub() ? UV2H_LB_BAU_SB_DESCRIPTOR_BASE : \ + is_uv3_hub() ? UV3H_LB_BAU_SB_DESCRIPTOR_BASE : \ + /*is_uv4_hub*/ UV4H_LB_BAU_SB_DESCRIPTOR_BASE) + +#define UV1H_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9a0 +#define UV2H_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9a0 +#define UV3H_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9a0 +#define UV4H_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9c0 +#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 ( \ + is_uv1_hub() ? UV1H_LB_BAU_SB_DESCRIPTOR_BASE_32 : \ + is_uv2_hub() ? UV2H_LB_BAU_SB_DESCRIPTOR_BASE_32 : \ + is_uv3_hub() ? 
UV3H_LB_BAU_SB_DESCRIPTOR_BASE_32 : \ + /*is_uv4_hub*/ UV4H_LB_BAU_SB_DESCRIPTOR_BASE_32) #define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_SHFT 12 #define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 49 -#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL #define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK 0x7ffe000000000000UL +#define UV1H_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL + + +#define UV2H_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL + +#define UV3H_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL + +#define UV4H_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x00003ffffffff000UL + + union uvh_lb_bau_sb_descriptor_base_u { unsigned long v; struct uvh_lb_bau_sb_descriptor_base_s { unsigned long rsvd_0_11:12; - unsigned long page_address:31; /* RW */ - unsigned long rsvd_43_48:6; + unsigned long rsvd_12_48:37; unsigned long node_id:14; /* RW */ unsigned long rsvd_63:1; } s; + struct uv4h_lb_bau_sb_descriptor_base_s { + unsigned long rsvd_0_11:12; + unsigned long page_address:34; /* RW */ + unsigned long rsvd_46_48:3; + unsigned long node_id:14; /* RW */ + unsigned long rsvd_63:1; + } s4; }; /* ========================================================================= */ @@ -2004,6 +2816,7 @@ union uvh_lb_bau_sb_descriptor_base_u { #define UV1H_NODE_ID 0x0UL #define UV2H_NODE_ID 0x0UL #define UV3H_NODE_ID 0x0UL +#define UV4H_NODE_ID 0x0UL #define UVH_NODE_ID_FORCE1_SHFT 0 #define UVH_NODE_ID_MANUFACTURER_SHFT 1 @@ -2080,6 +2893,26 @@ union uvh_lb_bau_sb_descriptor_base_u { #define UV3H_NODE_ID_NODES_PER_BIT_MASK 0x01fc000000000000UL #define UV3H_NODE_ID_NI_PORT_MASK 0x3e00000000000000UL +#define UV4H_NODE_ID_FORCE1_SHFT 0 +#define UV4H_NODE_ID_MANUFACTURER_SHFT 1 +#define UV4H_NODE_ID_PART_NUMBER_SHFT 12 +#define UV4H_NODE_ID_REVISION_SHFT 28 +#define UV4H_NODE_ID_NODE_ID_SHFT 32 +#define UV4H_NODE_ID_ROUTER_SELECT_SHFT 48 +#define UV4H_NODE_ID_RESERVED_2_SHFT 49 +#define UV4H_NODE_ID_NODES_PER_BIT_SHFT 50 +#define UV4H_NODE_ID_NI_PORT_SHFT 57 +#define UV4H_NODE_ID_FORCE1_MASK 0x0000000000000001UL +#define UV4H_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL +#define UV4H_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL +#define UV4H_NODE_ID_REVISION_MASK 0x00000000f0000000UL +#define UV4H_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL +#define UV4H_NODE_ID_ROUTER_SELECT_MASK 0x0001000000000000UL +#define UV4H_NODE_ID_RESERVED_2_MASK 0x0002000000000000UL +#define UV4H_NODE_ID_NODES_PER_BIT_MASK 0x01fc000000000000UL +#define UV4H_NODE_ID_NI_PORT_MASK 0x3e00000000000000UL + + union uvh_node_id_u { unsigned long v; struct uvh_node_id_s { @@ -2137,17 +2970,40 @@ union uvh_node_id_u { unsigned long ni_port:5; /* RO */ unsigned long rsvd_62_63:2; } s3; + struct uv4h_node_id_s { + unsigned long force1:1; /* RO */ + unsigned long manufacturer:11; /* RO */ + unsigned long part_number:16; /* RO */ + unsigned long revision:4; /* RO */ + unsigned long node_id:15; /* RW */ + unsigned long rsvd_47:1; + unsigned long router_select:1; /* RO */ + unsigned long rsvd_49:1; + unsigned long nodes_per_bit:7; /* RO */ + unsigned long ni_port:5; /* RO */ + unsigned long rsvd_62_63:2; + } s4; }; /* ========================================================================= */ /* UVH_NODE_PRESENT_TABLE */ /* ========================================================================= */ #define UVH_NODE_PRESENT_TABLE 0x1400UL -#define UVH_NODE_PRESENT_TABLE_DEPTH 16 + +#define UV1H_NODE_PRESENT_TABLE_DEPTH 16 +#define 
UV2H_NODE_PRESENT_TABLE_DEPTH 16 +#define UV3H_NODE_PRESENT_TABLE_DEPTH 16 +#define UV4H_NODE_PRESENT_TABLE_DEPTH 4 +#define UVH_NODE_PRESENT_TABLE_DEPTH ( \ + is_uv1_hub() ? UV1H_NODE_PRESENT_TABLE_DEPTH : \ + is_uv2_hub() ? UV2H_NODE_PRESENT_TABLE_DEPTH : \ + is_uv3_hub() ? UV3H_NODE_PRESENT_TABLE_DEPTH : \ + /*is_uv4_hub*/ UV4H_NODE_PRESENT_TABLE_DEPTH) #define UVH_NODE_PRESENT_TABLE_NODES_SHFT 0 #define UVH_NODE_PRESENT_TABLE_NODES_MASK 0xffffffffffffffffUL + union uvh_node_present_table_u { unsigned long v; struct uvh_node_present_table_s { @@ -2158,7 +3014,15 @@ union uvh_node_present_table_u { /* ========================================================================= */ /* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL +#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL +#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL +#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL +#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x4800c8UL +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR ( \ + is_uv1_hub() ? UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR : \ + is_uv2_hub() ? UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR : \ + is_uv3_hub() ? UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR : \ + /*is_uv4_hub*/ UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR) #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_SHFT 24 #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_SHFT 48 @@ -2167,6 +3031,7 @@ union uvh_node_present_table_u { #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_MASK 0x8000000000000000UL + union uvh_rh_gam_alias210_overlay_config_0_mmr_u { unsigned long v; struct uvh_rh_gam_alias210_overlay_config_0_mmr_s { @@ -2182,7 +3047,15 @@ union uvh_rh_gam_alias210_overlay_config_0_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL +#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL +#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL +#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL +#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x4800d8UL +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR ( \ + is_uv1_hub() ? UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR : \ + is_uv2_hub() ? UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR : \ + is_uv3_hub() ? 
UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR : \ + /*is_uv4_hub*/ UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR) #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_SHFT 24 #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_SHFT 48 @@ -2191,6 +3064,7 @@ union uvh_rh_gam_alias210_overlay_config_0_mmr_u { #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_MASK 0x8000000000000000UL + union uvh_rh_gam_alias210_overlay_config_1_mmr_u { unsigned long v; struct uvh_rh_gam_alias210_overlay_config_1_mmr_s { @@ -2206,7 +3080,15 @@ union uvh_rh_gam_alias210_overlay_config_1_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL +#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL +#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL +#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL +#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x4800e8UL +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR ( \ + is_uv1_hub() ? UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR : \ + is_uv2_hub() ? UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR : \ + is_uv3_hub() ? UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR : \ + /*is_uv4_hub*/ UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR) #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_SHFT 24 #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_SHFT 48 @@ -2215,6 +3097,7 @@ union uvh_rh_gam_alias210_overlay_config_1_mmr_u { #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_MASK 0x8000000000000000UL + union uvh_rh_gam_alias210_overlay_config_2_mmr_u { unsigned long v; struct uvh_rh_gam_alias210_overlay_config_2_mmr_s { @@ -2230,11 +3113,20 @@ union uvh_rh_gam_alias210_overlay_config_2_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL +#define UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL +#define UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL +#define UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL +#define UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x4800d0UL +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR ( \ + is_uv1_hub() ? UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR : \ + is_uv2_hub() ? UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR : \ + is_uv3_hub() ? 
UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR : \ + /*is_uv4_hub*/ UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR) #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24 #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL + union uvh_rh_gam_alias210_redirect_config_0_mmr_u { unsigned long v; struct uvh_rh_gam_alias210_redirect_config_0_mmr_s { @@ -2247,11 +3139,20 @@ union uvh_rh_gam_alias210_redirect_config_0_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL +#define UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL +#define UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL +#define UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL +#define UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x4800e0UL +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR ( \ + is_uv1_hub() ? UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR : \ + is_uv2_hub() ? UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR : \ + is_uv3_hub() ? UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR : \ + /*is_uv4_hub*/ UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR) #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24 #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL + union uvh_rh_gam_alias210_redirect_config_1_mmr_u { unsigned long v; struct uvh_rh_gam_alias210_redirect_config_1_mmr_s { @@ -2264,11 +3165,20 @@ union uvh_rh_gam_alias210_redirect_config_1_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL +#define UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL +#define UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL +#define UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL +#define UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x4800f0UL +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR ( \ + is_uv1_hub() ? UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR : \ + is_uv2_hub() ? UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR : \ + is_uv3_hub() ? UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR : \ + /*is_uv4_hub*/ UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR) #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24 #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL + union uvh_rh_gam_alias210_redirect_config_2_mmr_u { unsigned long v; struct uvh_rh_gam_alias210_redirect_config_2_mmr_s { @@ -2281,14 +3191,17 @@ union uvh_rh_gam_alias210_redirect_config_2_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_CONFIG_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_CONFIG_MMR 0x1600000UL #define UV1H_RH_GAM_CONFIG_MMR 0x1600000UL #define UV2H_RH_GAM_CONFIG_MMR 0x1600000UL #define UV3H_RH_GAM_CONFIG_MMR 0x1600000UL +#define UV4H_RH_GAM_CONFIG_MMR 0x480000UL +#define UVH_RH_GAM_CONFIG_MMR ( \ + is_uv1_hub() ? UV1H_RH_GAM_CONFIG_MMR : \ + is_uv2_hub() ? UV2H_RH_GAM_CONFIG_MMR : \ + is_uv3_hub() ? 
UV3H_RH_GAM_CONFIG_MMR : \ + /*is_uv4_hub*/ UV4H_RH_GAM_CONFIG_MMR) -#define UVH_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0 #define UVH_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 -#define UVH_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL #define UVH_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL #define UV1H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0 @@ -2298,9 +3211,7 @@ union uvh_rh_gam_alias210_redirect_config_2_mmr_u { #define UV1H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL #define UV1H_RH_GAM_CONFIG_MMR_MMIOL_CFG_MASK 0x0000000000001000UL -#define UVXH_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0 #define UVXH_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 -#define UVXH_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL #define UVXH_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL #define UV2H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0 @@ -2313,10 +3224,14 @@ union uvh_rh_gam_alias210_redirect_config_2_mmr_u { #define UV3H_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL #define UV3H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL +#define UV4H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 +#define UV4H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL + + union uvh_rh_gam_config_mmr_u { unsigned long v; struct uvh_rh_gam_config_mmr_s { - unsigned long m_skt:6; /* RW */ + unsigned long rsvd_0_5:6; unsigned long n_skt:4; /* RW */ unsigned long rsvd_10_63:54; } s; @@ -2328,7 +3243,7 @@ union uvh_rh_gam_config_mmr_u { unsigned long rsvd_13_63:51; } s1; struct uvxh_rh_gam_config_mmr_s { - unsigned long m_skt:6; /* RW */ + unsigned long rsvd_0_5:6; unsigned long n_skt:4; /* RW */ unsigned long rsvd_10_63:54; } sx; @@ -2342,20 +3257,28 @@ union uvh_rh_gam_config_mmr_u { unsigned long n_skt:4; /* RW */ unsigned long rsvd_10_63:54; } s3; + struct uv4h_rh_gam_config_mmr_s { + unsigned long rsvd_0_5:6; + unsigned long n_skt:4; /* RW */ + unsigned long rsvd_10_63:54; + } s4; }; /* ========================================================================= */ /* UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL #define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL #define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL #define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL +#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x480010UL +#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR ( \ + is_uv1_hub() ? UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR : \ + is_uv2_hub() ? UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR : \ + is_uv3_hub() ? 
UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR : \ + /*is_uv4_hub*/ UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR) -#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28 #define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 #define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 -#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL #define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL #define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL @@ -2368,10 +3291,8 @@ union uvh_rh_gam_config_mmr_u { #define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL #define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL -#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28 #define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 #define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 -#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL #define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL #define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL @@ -2391,12 +3312,28 @@ union uvh_rh_gam_config_mmr_u { #define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_MODE_MASK 0x4000000000000000UL #define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL +#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 26 +#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 +#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL +#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL +#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL + +#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK ( \ + is_uv1_hub() ? UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK : \ + is_uv2_hub() ? UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK : \ + is_uv3_hub() ? UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK : \ + /*is_uv4_hub*/ UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK) +#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT ( \ + is_uv1_hub() ? UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT : \ + is_uv2_hub() ? UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT : \ + is_uv3_hub() ? 
UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT : \ + /*is_uv4_hub*/ UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT) + union uvh_rh_gam_gru_overlay_config_mmr_u { unsigned long v; struct uvh_rh_gam_gru_overlay_config_mmr_s { - unsigned long rsvd_0_27:28; - unsigned long base:18; /* RW */ - unsigned long rsvd_46_51:6; + unsigned long rsvd_0_51:52; unsigned long n_gru:4; /* RW */ unsigned long rsvd_56_62:7; unsigned long enable:1; /* RW */ @@ -2412,8 +3349,7 @@ union uvh_rh_gam_gru_overlay_config_mmr_u { unsigned long enable:1; /* RW */ } s1; struct uvxh_rh_gam_gru_overlay_config_mmr_s { - unsigned long rsvd_0_27:28; - unsigned long base:18; /* RW */ + unsigned long rsvd_0_45:46; unsigned long rsvd_46_51:6; unsigned long n_gru:4; /* RW */ unsigned long rsvd_56_62:7; @@ -2436,6 +3372,15 @@ union uvh_rh_gam_gru_overlay_config_mmr_u { unsigned long mode:1; /* RW */ unsigned long enable:1; /* RW */ } s3; + struct uv4h_rh_gam_gru_overlay_config_mmr_s { + unsigned long rsvd_0_24:25; + unsigned long undef_25:1; /* Undefined */ + unsigned long base:20; /* RW */ + unsigned long rsvd_46_51:6; + unsigned long n_gru:4; /* RW */ + unsigned long rsvd_56_62:7; + unsigned long enable:1; /* RW */ + } s4; }; /* ========================================================================= */ @@ -2443,6 +3388,14 @@ union uvh_rh_gam_gru_overlay_config_mmr_u { /* ========================================================================= */ #define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL #define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR uv_undefined("UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR") +#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR uv_undefined("UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR") +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR ( \ + is_uv1_hub() ? UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR : \ + is_uv2_hub() ? UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR : \ + is_uv3_hub() ? UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR : \ + /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR) + #define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 30 #define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46 @@ -2453,6 +3406,7 @@ union uvh_rh_gam_gru_overlay_config_mmr_u { #define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL #define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL + #define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 27 #define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46 #define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52 @@ -2462,6 +3416,7 @@ union uvh_rh_gam_gru_overlay_config_mmr_u { #define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL #define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL + union uvh_rh_gam_mmioh_overlay_config_mmr_u { unsigned long v; struct uv1h_rh_gam_mmioh_overlay_config_mmr_s { @@ -2485,10 +3440,15 @@ union uvh_rh_gam_mmioh_overlay_config_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL #define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL #define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL #define UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL +#define UV4H_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x480028UL +#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR ( \ + is_uv1_hub() ? UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR : \ + is_uv2_hub() ? 
UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR : \ + is_uv3_hub() ? UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR : \ + /*is_uv4_hub*/ UV4H_RH_GAM_MMR_OVERLAY_CONFIG_MMR) #define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 #define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 @@ -2517,6 +3477,12 @@ union uvh_rh_gam_mmioh_overlay_config_mmr_u { #define UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL #define UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL +#define UV4H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 +#define UV4H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UV4H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL +#define UV4H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL + + union uvh_rh_gam_mmr_overlay_config_mmr_u { unsigned long v; struct uvh_rh_gam_mmr_overlay_config_mmr_s { @@ -2550,16 +3516,31 @@ union uvh_rh_gam_mmr_overlay_config_mmr_u { unsigned long rsvd_46_62:17; unsigned long enable:1; /* RW */ } s3; + struct uv4h_rh_gam_mmr_overlay_config_mmr_s { + unsigned long rsvd_0_25:26; + unsigned long base:20; /* RW */ + unsigned long rsvd_46_62:17; + unsigned long enable:1; /* RW */ + } s4; }; /* ========================================================================= */ /* UVH_RTC */ /* ========================================================================= */ -#define UVH_RTC 0x340000UL +#define UV1H_RTC 0x340000UL +#define UV2H_RTC 0x340000UL +#define UV3H_RTC 0x340000UL +#define UV4H_RTC 0xe0000UL +#define UVH_RTC ( \ + is_uv1_hub() ? UV1H_RTC : \ + is_uv2_hub() ? UV2H_RTC : \ + is_uv3_hub() ? UV3H_RTC : \ + /*is_uv4_hub*/ UV4H_RTC) #define UVH_RTC_REAL_TIME_CLOCK_SHFT 0 #define UVH_RTC_REAL_TIME_CLOCK_MASK 0x00ffffffffffffffUL + union uvh_rtc_u { unsigned long v; struct uvh_rtc_s { @@ -2590,6 +3571,7 @@ union uvh_rtc_u { #define UVH_RTC1_INT_CONFIG_M_MASK 0x0000000000010000UL #define UVH_RTC1_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL + union uvh_rtc1_int_config_u { unsigned long v; struct uvh_rtc1_int_config_s { @@ -2609,12 +3591,30 @@ union uvh_rtc1_int_config_u { /* ========================================================================= */ /* UVH_SCRATCH5 */ /* ========================================================================= */ -#define UVH_SCRATCH5 0x2d0200UL -#define UVH_SCRATCH5_32 0x778 +#define UV1H_SCRATCH5 0x2d0200UL +#define UV2H_SCRATCH5 0x2d0200UL +#define UV3H_SCRATCH5 0x2d0200UL +#define UV4H_SCRATCH5 0xb0200UL +#define UVH_SCRATCH5 ( \ + is_uv1_hub() ? UV1H_SCRATCH5 : \ + is_uv2_hub() ? UV2H_SCRATCH5 : \ + is_uv3_hub() ? UV3H_SCRATCH5 : \ + /*is_uv4_hub*/ UV4H_SCRATCH5) + +#define UV1H_SCRATCH5_32 0x778 +#define UV2H_SCRATCH5_32 0x778 +#define UV3H_SCRATCH5_32 0x778 +#define UV4H_SCRATCH5_32 0x798 +#define UVH_SCRATCH5_32 ( \ + is_uv1_hub() ? UV1H_SCRATCH5_32 : \ + is_uv2_hub() ? UV2H_SCRATCH5_32 : \ + is_uv3_hub() ? 
UV3H_SCRATCH5_32 : \ + /*is_uv4_hub*/ UV4H_SCRATCH5_32) #define UVH_SCRATCH5_SCRATCH5_SHFT 0 #define UVH_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL + union uvh_scratch5_u { unsigned long v; struct uvh_scratch5_s { @@ -2625,14 +3625,39 @@ union uvh_scratch5_u { /* ========================================================================= */ /* UVH_SCRATCH5_ALIAS */ /* ========================================================================= */ -#define UVH_SCRATCH5_ALIAS 0x2d0208UL -#define UVH_SCRATCH5_ALIAS_32 0x780 +#define UV1H_SCRATCH5_ALIAS 0x2d0208UL +#define UV2H_SCRATCH5_ALIAS 0x2d0208UL +#define UV3H_SCRATCH5_ALIAS 0x2d0208UL +#define UV4H_SCRATCH5_ALIAS 0xb0208UL +#define UVH_SCRATCH5_ALIAS ( \ + is_uv1_hub() ? UV1H_SCRATCH5_ALIAS : \ + is_uv2_hub() ? UV2H_SCRATCH5_ALIAS : \ + is_uv3_hub() ? UV3H_SCRATCH5_ALIAS : \ + /*is_uv4_hub*/ UV4H_SCRATCH5_ALIAS) + +#define UV1H_SCRATCH5_ALIAS_32 0x780 +#define UV2H_SCRATCH5_ALIAS_32 0x780 +#define UV3H_SCRATCH5_ALIAS_32 0x780 +#define UV4H_SCRATCH5_ALIAS_32 0x7a0 +#define UVH_SCRATCH5_ALIAS_32 ( \ + is_uv1_hub() ? UV1H_SCRATCH5_ALIAS_32 : \ + is_uv2_hub() ? UV2H_SCRATCH5_ALIAS_32 : \ + is_uv3_hub() ? UV3H_SCRATCH5_ALIAS_32 : \ + /*is_uv4_hub*/ UV4H_SCRATCH5_ALIAS_32) /* ========================================================================= */ /* UVH_SCRATCH5_ALIAS_2 */ /* ========================================================================= */ -#define UVH_SCRATCH5_ALIAS_2 0x2d0210UL +#define UV1H_SCRATCH5_ALIAS_2 0x2d0210UL +#define UV2H_SCRATCH5_ALIAS_2 0x2d0210UL +#define UV3H_SCRATCH5_ALIAS_2 0x2d0210UL +#define UV4H_SCRATCH5_ALIAS_2 0xb0210UL +#define UVH_SCRATCH5_ALIAS_2 ( \ + is_uv1_hub() ? UV1H_SCRATCH5_ALIAS_2 : \ + is_uv2_hub() ? UV2H_SCRATCH5_ALIAS_2 : \ + is_uv3_hub() ? UV3H_SCRATCH5_ALIAS_2 : \ + /*is_uv4_hub*/ UV4H_SCRATCH5_ALIAS_2) #define UVH_SCRATCH5_ALIAS_2_32 0x788 @@ -2640,76 +3665,255 @@ union uvh_scratch5_u { /* UVXH_EVENT_OCCURRED2 */ /* ========================================================================= */ #define UVXH_EVENT_OCCURRED2 0x70100UL -#define UVXH_EVENT_OCCURRED2_32 0xb68 - -#define UVXH_EVENT_OCCURRED2_RTC_0_SHFT 0 -#define UVXH_EVENT_OCCURRED2_RTC_1_SHFT 1 -#define UVXH_EVENT_OCCURRED2_RTC_2_SHFT 2 -#define UVXH_EVENT_OCCURRED2_RTC_3_SHFT 3 -#define UVXH_EVENT_OCCURRED2_RTC_4_SHFT 4 -#define UVXH_EVENT_OCCURRED2_RTC_5_SHFT 5 -#define UVXH_EVENT_OCCURRED2_RTC_6_SHFT 6 -#define UVXH_EVENT_OCCURRED2_RTC_7_SHFT 7 -#define UVXH_EVENT_OCCURRED2_RTC_8_SHFT 8 -#define UVXH_EVENT_OCCURRED2_RTC_9_SHFT 9 -#define UVXH_EVENT_OCCURRED2_RTC_10_SHFT 10 -#define UVXH_EVENT_OCCURRED2_RTC_11_SHFT 11 -#define UVXH_EVENT_OCCURRED2_RTC_12_SHFT 12 -#define UVXH_EVENT_OCCURRED2_RTC_13_SHFT 13 -#define UVXH_EVENT_OCCURRED2_RTC_14_SHFT 14 -#define UVXH_EVENT_OCCURRED2_RTC_15_SHFT 15 -#define UVXH_EVENT_OCCURRED2_RTC_16_SHFT 16 -#define UVXH_EVENT_OCCURRED2_RTC_17_SHFT 17 -#define UVXH_EVENT_OCCURRED2_RTC_18_SHFT 18 -#define UVXH_EVENT_OCCURRED2_RTC_19_SHFT 19 -#define UVXH_EVENT_OCCURRED2_RTC_20_SHFT 20 -#define UVXH_EVENT_OCCURRED2_RTC_21_SHFT 21 -#define UVXH_EVENT_OCCURRED2_RTC_22_SHFT 22 -#define UVXH_EVENT_OCCURRED2_RTC_23_SHFT 23 -#define UVXH_EVENT_OCCURRED2_RTC_24_SHFT 24 -#define UVXH_EVENT_OCCURRED2_RTC_25_SHFT 25 -#define UVXH_EVENT_OCCURRED2_RTC_26_SHFT 26 -#define UVXH_EVENT_OCCURRED2_RTC_27_SHFT 27 -#define UVXH_EVENT_OCCURRED2_RTC_28_SHFT 28 -#define UVXH_EVENT_OCCURRED2_RTC_29_SHFT 29 -#define UVXH_EVENT_OCCURRED2_RTC_30_SHFT 30 -#define UVXH_EVENT_OCCURRED2_RTC_31_SHFT 31 -#define 
UVXH_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000000001UL -#define UVXH_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000000002UL -#define UVXH_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000000004UL -#define UVXH_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000000008UL -#define UVXH_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000000010UL -#define UVXH_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000000020UL -#define UVXH_EVENT_OCCURRED2_RTC_6_MASK 0x0000000000000040UL -#define UVXH_EVENT_OCCURRED2_RTC_7_MASK 0x0000000000000080UL -#define UVXH_EVENT_OCCURRED2_RTC_8_MASK 0x0000000000000100UL -#define UVXH_EVENT_OCCURRED2_RTC_9_MASK 0x0000000000000200UL -#define UVXH_EVENT_OCCURRED2_RTC_10_MASK 0x0000000000000400UL -#define UVXH_EVENT_OCCURRED2_RTC_11_MASK 0x0000000000000800UL -#define UVXH_EVENT_OCCURRED2_RTC_12_MASK 0x0000000000001000UL -#define UVXH_EVENT_OCCURRED2_RTC_13_MASK 0x0000000000002000UL -#define UVXH_EVENT_OCCURRED2_RTC_14_MASK 0x0000000000004000UL -#define UVXH_EVENT_OCCURRED2_RTC_15_MASK 0x0000000000008000UL -#define UVXH_EVENT_OCCURRED2_RTC_16_MASK 0x0000000000010000UL -#define UVXH_EVENT_OCCURRED2_RTC_17_MASK 0x0000000000020000UL -#define UVXH_EVENT_OCCURRED2_RTC_18_MASK 0x0000000000040000UL -#define UVXH_EVENT_OCCURRED2_RTC_19_MASK 0x0000000000080000UL -#define UVXH_EVENT_OCCURRED2_RTC_20_MASK 0x0000000000100000UL -#define UVXH_EVENT_OCCURRED2_RTC_21_MASK 0x0000000000200000UL -#define UVXH_EVENT_OCCURRED2_RTC_22_MASK 0x0000000000400000UL -#define UVXH_EVENT_OCCURRED2_RTC_23_MASK 0x0000000000800000UL -#define UVXH_EVENT_OCCURRED2_RTC_24_MASK 0x0000000001000000UL -#define UVXH_EVENT_OCCURRED2_RTC_25_MASK 0x0000000002000000UL -#define UVXH_EVENT_OCCURRED2_RTC_26_MASK 0x0000000004000000UL -#define UVXH_EVENT_OCCURRED2_RTC_27_MASK 0x0000000008000000UL -#define UVXH_EVENT_OCCURRED2_RTC_28_MASK 0x0000000010000000UL -#define UVXH_EVENT_OCCURRED2_RTC_29_MASK 0x0000000020000000UL -#define UVXH_EVENT_OCCURRED2_RTC_30_MASK 0x0000000040000000UL -#define UVXH_EVENT_OCCURRED2_RTC_31_MASK 0x0000000080000000UL - -union uvxh_event_occurred2_u { + +#define UV2H_EVENT_OCCURRED2_32 0xb68 +#define UV3H_EVENT_OCCURRED2_32 0xb68 +#define UV4H_EVENT_OCCURRED2_32 0x608 +#define UVH_EVENT_OCCURRED2_32 ( \ + is_uv2_hub() ? UV2H_EVENT_OCCURRED2_32 : \ + is_uv3_hub() ? 
UV3H_EVENT_OCCURRED2_32 : \ + /*is_uv4_hub*/ UV4H_EVENT_OCCURRED2_32) + + +#define UV2H_EVENT_OCCURRED2_RTC_0_SHFT 0 +#define UV2H_EVENT_OCCURRED2_RTC_1_SHFT 1 +#define UV2H_EVENT_OCCURRED2_RTC_2_SHFT 2 +#define UV2H_EVENT_OCCURRED2_RTC_3_SHFT 3 +#define UV2H_EVENT_OCCURRED2_RTC_4_SHFT 4 +#define UV2H_EVENT_OCCURRED2_RTC_5_SHFT 5 +#define UV2H_EVENT_OCCURRED2_RTC_6_SHFT 6 +#define UV2H_EVENT_OCCURRED2_RTC_7_SHFT 7 +#define UV2H_EVENT_OCCURRED2_RTC_8_SHFT 8 +#define UV2H_EVENT_OCCURRED2_RTC_9_SHFT 9 +#define UV2H_EVENT_OCCURRED2_RTC_10_SHFT 10 +#define UV2H_EVENT_OCCURRED2_RTC_11_SHFT 11 +#define UV2H_EVENT_OCCURRED2_RTC_12_SHFT 12 +#define UV2H_EVENT_OCCURRED2_RTC_13_SHFT 13 +#define UV2H_EVENT_OCCURRED2_RTC_14_SHFT 14 +#define UV2H_EVENT_OCCURRED2_RTC_15_SHFT 15 +#define UV2H_EVENT_OCCURRED2_RTC_16_SHFT 16 +#define UV2H_EVENT_OCCURRED2_RTC_17_SHFT 17 +#define UV2H_EVENT_OCCURRED2_RTC_18_SHFT 18 +#define UV2H_EVENT_OCCURRED2_RTC_19_SHFT 19 +#define UV2H_EVENT_OCCURRED2_RTC_20_SHFT 20 +#define UV2H_EVENT_OCCURRED2_RTC_21_SHFT 21 +#define UV2H_EVENT_OCCURRED2_RTC_22_SHFT 22 +#define UV2H_EVENT_OCCURRED2_RTC_23_SHFT 23 +#define UV2H_EVENT_OCCURRED2_RTC_24_SHFT 24 +#define UV2H_EVENT_OCCURRED2_RTC_25_SHFT 25 +#define UV2H_EVENT_OCCURRED2_RTC_26_SHFT 26 +#define UV2H_EVENT_OCCURRED2_RTC_27_SHFT 27 +#define UV2H_EVENT_OCCURRED2_RTC_28_SHFT 28 +#define UV2H_EVENT_OCCURRED2_RTC_29_SHFT 29 +#define UV2H_EVENT_OCCURRED2_RTC_30_SHFT 30 +#define UV2H_EVENT_OCCURRED2_RTC_31_SHFT 31 +#define UV2H_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000000001UL +#define UV2H_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000000002UL +#define UV2H_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000000004UL +#define UV2H_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000000008UL +#define UV2H_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000000010UL +#define UV2H_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000000020UL +#define UV2H_EVENT_OCCURRED2_RTC_6_MASK 0x0000000000000040UL +#define UV2H_EVENT_OCCURRED2_RTC_7_MASK 0x0000000000000080UL +#define UV2H_EVENT_OCCURRED2_RTC_8_MASK 0x0000000000000100UL +#define UV2H_EVENT_OCCURRED2_RTC_9_MASK 0x0000000000000200UL +#define UV2H_EVENT_OCCURRED2_RTC_10_MASK 0x0000000000000400UL +#define UV2H_EVENT_OCCURRED2_RTC_11_MASK 0x0000000000000800UL +#define UV2H_EVENT_OCCURRED2_RTC_12_MASK 0x0000000000001000UL +#define UV2H_EVENT_OCCURRED2_RTC_13_MASK 0x0000000000002000UL +#define UV2H_EVENT_OCCURRED2_RTC_14_MASK 0x0000000000004000UL +#define UV2H_EVENT_OCCURRED2_RTC_15_MASK 0x0000000000008000UL +#define UV2H_EVENT_OCCURRED2_RTC_16_MASK 0x0000000000010000UL +#define UV2H_EVENT_OCCURRED2_RTC_17_MASK 0x0000000000020000UL +#define UV2H_EVENT_OCCURRED2_RTC_18_MASK 0x0000000000040000UL +#define UV2H_EVENT_OCCURRED2_RTC_19_MASK 0x0000000000080000UL +#define UV2H_EVENT_OCCURRED2_RTC_20_MASK 0x0000000000100000UL +#define UV2H_EVENT_OCCURRED2_RTC_21_MASK 0x0000000000200000UL +#define UV2H_EVENT_OCCURRED2_RTC_22_MASK 0x0000000000400000UL +#define UV2H_EVENT_OCCURRED2_RTC_23_MASK 0x0000000000800000UL +#define UV2H_EVENT_OCCURRED2_RTC_24_MASK 0x0000000001000000UL +#define UV2H_EVENT_OCCURRED2_RTC_25_MASK 0x0000000002000000UL +#define UV2H_EVENT_OCCURRED2_RTC_26_MASK 0x0000000004000000UL +#define UV2H_EVENT_OCCURRED2_RTC_27_MASK 0x0000000008000000UL +#define UV2H_EVENT_OCCURRED2_RTC_28_MASK 0x0000000010000000UL +#define UV2H_EVENT_OCCURRED2_RTC_29_MASK 0x0000000020000000UL +#define UV2H_EVENT_OCCURRED2_RTC_30_MASK 0x0000000040000000UL +#define UV2H_EVENT_OCCURRED2_RTC_31_MASK 0x0000000080000000UL + +#define UV3H_EVENT_OCCURRED2_RTC_0_SHFT 0 +#define 
UV3H_EVENT_OCCURRED2_RTC_1_SHFT 1 +#define UV3H_EVENT_OCCURRED2_RTC_2_SHFT 2 +#define UV3H_EVENT_OCCURRED2_RTC_3_SHFT 3 +#define UV3H_EVENT_OCCURRED2_RTC_4_SHFT 4 +#define UV3H_EVENT_OCCURRED2_RTC_5_SHFT 5 +#define UV3H_EVENT_OCCURRED2_RTC_6_SHFT 6 +#define UV3H_EVENT_OCCURRED2_RTC_7_SHFT 7 +#define UV3H_EVENT_OCCURRED2_RTC_8_SHFT 8 +#define UV3H_EVENT_OCCURRED2_RTC_9_SHFT 9 +#define UV3H_EVENT_OCCURRED2_RTC_10_SHFT 10 +#define UV3H_EVENT_OCCURRED2_RTC_11_SHFT 11 +#define UV3H_EVENT_OCCURRED2_RTC_12_SHFT 12 +#define UV3H_EVENT_OCCURRED2_RTC_13_SHFT 13 +#define UV3H_EVENT_OCCURRED2_RTC_14_SHFT 14 +#define UV3H_EVENT_OCCURRED2_RTC_15_SHFT 15 +#define UV3H_EVENT_OCCURRED2_RTC_16_SHFT 16 +#define UV3H_EVENT_OCCURRED2_RTC_17_SHFT 17 +#define UV3H_EVENT_OCCURRED2_RTC_18_SHFT 18 +#define UV3H_EVENT_OCCURRED2_RTC_19_SHFT 19 +#define UV3H_EVENT_OCCURRED2_RTC_20_SHFT 20 +#define UV3H_EVENT_OCCURRED2_RTC_21_SHFT 21 +#define UV3H_EVENT_OCCURRED2_RTC_22_SHFT 22 +#define UV3H_EVENT_OCCURRED2_RTC_23_SHFT 23 +#define UV3H_EVENT_OCCURRED2_RTC_24_SHFT 24 +#define UV3H_EVENT_OCCURRED2_RTC_25_SHFT 25 +#define UV3H_EVENT_OCCURRED2_RTC_26_SHFT 26 +#define UV3H_EVENT_OCCURRED2_RTC_27_SHFT 27 +#define UV3H_EVENT_OCCURRED2_RTC_28_SHFT 28 +#define UV3H_EVENT_OCCURRED2_RTC_29_SHFT 29 +#define UV3H_EVENT_OCCURRED2_RTC_30_SHFT 30 +#define UV3H_EVENT_OCCURRED2_RTC_31_SHFT 31 +#define UV3H_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000000001UL +#define UV3H_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000000002UL +#define UV3H_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000000004UL +#define UV3H_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000000008UL +#define UV3H_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000000010UL +#define UV3H_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000000020UL +#define UV3H_EVENT_OCCURRED2_RTC_6_MASK 0x0000000000000040UL +#define UV3H_EVENT_OCCURRED2_RTC_7_MASK 0x0000000000000080UL +#define UV3H_EVENT_OCCURRED2_RTC_8_MASK 0x0000000000000100UL +#define UV3H_EVENT_OCCURRED2_RTC_9_MASK 0x0000000000000200UL +#define UV3H_EVENT_OCCURRED2_RTC_10_MASK 0x0000000000000400UL +#define UV3H_EVENT_OCCURRED2_RTC_11_MASK 0x0000000000000800UL +#define UV3H_EVENT_OCCURRED2_RTC_12_MASK 0x0000000000001000UL +#define UV3H_EVENT_OCCURRED2_RTC_13_MASK 0x0000000000002000UL +#define UV3H_EVENT_OCCURRED2_RTC_14_MASK 0x0000000000004000UL +#define UV3H_EVENT_OCCURRED2_RTC_15_MASK 0x0000000000008000UL +#define UV3H_EVENT_OCCURRED2_RTC_16_MASK 0x0000000000010000UL +#define UV3H_EVENT_OCCURRED2_RTC_17_MASK 0x0000000000020000UL +#define UV3H_EVENT_OCCURRED2_RTC_18_MASK 0x0000000000040000UL +#define UV3H_EVENT_OCCURRED2_RTC_19_MASK 0x0000000000080000UL +#define UV3H_EVENT_OCCURRED2_RTC_20_MASK 0x0000000000100000UL +#define UV3H_EVENT_OCCURRED2_RTC_21_MASK 0x0000000000200000UL +#define UV3H_EVENT_OCCURRED2_RTC_22_MASK 0x0000000000400000UL +#define UV3H_EVENT_OCCURRED2_RTC_23_MASK 0x0000000000800000UL +#define UV3H_EVENT_OCCURRED2_RTC_24_MASK 0x0000000001000000UL +#define UV3H_EVENT_OCCURRED2_RTC_25_MASK 0x0000000002000000UL +#define UV3H_EVENT_OCCURRED2_RTC_26_MASK 0x0000000004000000UL +#define UV3H_EVENT_OCCURRED2_RTC_27_MASK 0x0000000008000000UL +#define UV3H_EVENT_OCCURRED2_RTC_28_MASK 0x0000000010000000UL +#define UV3H_EVENT_OCCURRED2_RTC_29_MASK 0x0000000020000000UL +#define UV3H_EVENT_OCCURRED2_RTC_30_MASK 0x0000000040000000UL +#define UV3H_EVENT_OCCURRED2_RTC_31_MASK 0x0000000080000000UL + +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT0_SHFT 0 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT1_SHFT 1 +#define 
UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT2_SHFT 2 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT3_SHFT 3 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT4_SHFT 4 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT5_SHFT 5 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT6_SHFT 6 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT7_SHFT 7 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT8_SHFT 8 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT9_SHFT 9 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT10_SHFT 10 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT11_SHFT 11 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT12_SHFT 12 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT13_SHFT 13 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT14_SHFT 14 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT15_SHFT 15 +#define UV4H_EVENT_OCCURRED2_RTC_INTERVAL_INT_SHFT 16 +#define UV4H_EVENT_OCCURRED2_BAU_DASHBOARD_INT_SHFT 17 +#define UV4H_EVENT_OCCURRED2_RTC_0_SHFT 18 +#define UV4H_EVENT_OCCURRED2_RTC_1_SHFT 19 +#define UV4H_EVENT_OCCURRED2_RTC_2_SHFT 20 +#define UV4H_EVENT_OCCURRED2_RTC_3_SHFT 21 +#define UV4H_EVENT_OCCURRED2_RTC_4_SHFT 22 +#define UV4H_EVENT_OCCURRED2_RTC_5_SHFT 23 +#define UV4H_EVENT_OCCURRED2_RTC_6_SHFT 24 +#define UV4H_EVENT_OCCURRED2_RTC_7_SHFT 25 +#define UV4H_EVENT_OCCURRED2_RTC_8_SHFT 26 +#define UV4H_EVENT_OCCURRED2_RTC_9_SHFT 27 +#define UV4H_EVENT_OCCURRED2_RTC_10_SHFT 28 +#define UV4H_EVENT_OCCURRED2_RTC_11_SHFT 29 +#define UV4H_EVENT_OCCURRED2_RTC_12_SHFT 30 +#define UV4H_EVENT_OCCURRED2_RTC_13_SHFT 31 +#define UV4H_EVENT_OCCURRED2_RTC_14_SHFT 32 +#define UV4H_EVENT_OCCURRED2_RTC_15_SHFT 33 +#define UV4H_EVENT_OCCURRED2_RTC_16_SHFT 34 +#define UV4H_EVENT_OCCURRED2_RTC_17_SHFT 35 +#define UV4H_EVENT_OCCURRED2_RTC_18_SHFT 36 +#define UV4H_EVENT_OCCURRED2_RTC_19_SHFT 37 +#define UV4H_EVENT_OCCURRED2_RTC_20_SHFT 38 +#define UV4H_EVENT_OCCURRED2_RTC_21_SHFT 39 +#define UV4H_EVENT_OCCURRED2_RTC_22_SHFT 40 +#define UV4H_EVENT_OCCURRED2_RTC_23_SHFT 41 +#define UV4H_EVENT_OCCURRED2_RTC_24_SHFT 42 +#define UV4H_EVENT_OCCURRED2_RTC_25_SHFT 43 +#define UV4H_EVENT_OCCURRED2_RTC_26_SHFT 44 +#define UV4H_EVENT_OCCURRED2_RTC_27_SHFT 45 +#define UV4H_EVENT_OCCURRED2_RTC_28_SHFT 46 +#define UV4H_EVENT_OCCURRED2_RTC_29_SHFT 47 +#define UV4H_EVENT_OCCURRED2_RTC_30_SHFT 48 +#define UV4H_EVENT_OCCURRED2_RTC_31_SHFT 49 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT0_MASK 0x0000000000000001UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT1_MASK 0x0000000000000002UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT2_MASK 0x0000000000000004UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT3_MASK 0x0000000000000008UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT4_MASK 0x0000000000000010UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT5_MASK 0x0000000000000020UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT6_MASK 0x0000000000000040UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT7_MASK 0x0000000000000080UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT8_MASK 0x0000000000000100UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT9_MASK 0x0000000000000200UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT10_MASK 0x0000000000000400UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT11_MASK 0x0000000000000800UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT12_MASK 0x0000000000001000UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT13_MASK 0x0000000000002000UL 
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT14_MASK 0x0000000000004000UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT15_MASK 0x0000000000008000UL +#define UV4H_EVENT_OCCURRED2_RTC_INTERVAL_INT_MASK 0x0000000000010000UL +#define UV4H_EVENT_OCCURRED2_BAU_DASHBOARD_INT_MASK 0x0000000000020000UL +#define UV4H_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000040000UL +#define UV4H_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000080000UL +#define UV4H_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000100000UL +#define UV4H_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000200000UL +#define UV4H_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000400000UL +#define UV4H_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000800000UL +#define UV4H_EVENT_OCCURRED2_RTC_6_MASK 0x0000000001000000UL +#define UV4H_EVENT_OCCURRED2_RTC_7_MASK 0x0000000002000000UL +#define UV4H_EVENT_OCCURRED2_RTC_8_MASK 0x0000000004000000UL +#define UV4H_EVENT_OCCURRED2_RTC_9_MASK 0x0000000008000000UL +#define UV4H_EVENT_OCCURRED2_RTC_10_MASK 0x0000000010000000UL +#define UV4H_EVENT_OCCURRED2_RTC_11_MASK 0x0000000020000000UL +#define UV4H_EVENT_OCCURRED2_RTC_12_MASK 0x0000000040000000UL +#define UV4H_EVENT_OCCURRED2_RTC_13_MASK 0x0000000080000000UL +#define UV4H_EVENT_OCCURRED2_RTC_14_MASK 0x0000000100000000UL +#define UV4H_EVENT_OCCURRED2_RTC_15_MASK 0x0000000200000000UL +#define UV4H_EVENT_OCCURRED2_RTC_16_MASK 0x0000000400000000UL +#define UV4H_EVENT_OCCURRED2_RTC_17_MASK 0x0000000800000000UL +#define UV4H_EVENT_OCCURRED2_RTC_18_MASK 0x0000001000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_19_MASK 0x0000002000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_20_MASK 0x0000004000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_21_MASK 0x0000008000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_22_MASK 0x0000010000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_23_MASK 0x0000020000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_24_MASK 0x0000040000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_25_MASK 0x0000080000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_26_MASK 0x0000100000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_27_MASK 0x0000200000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_28_MASK 0x0000400000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_29_MASK 0x0000800000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_30_MASK 0x0001000000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_31_MASK 0x0002000000000000UL + +#define UVXH_EVENT_OCCURRED2_RTC_1_MASK ( \ + is_uv2_hub() ? UV2H_EVENT_OCCURRED2_RTC_1_MASK : \ + is_uv3_hub() ? 
UV3H_EVENT_OCCURRED2_RTC_1_MASK : \ + /*is_uv4_hub*/ UV4H_EVENT_OCCURRED2_RTC_1_MASK) + +union uvh_event_occurred2_u { unsigned long v; - struct uvxh_event_occurred2_s { + struct uv2h_event_occurred2_s { unsigned long rtc_0:1; /* RW */ unsigned long rtc_1:1; /* RW */ unsigned long rtc_2:1; /* RW */ @@ -2743,25 +3947,129 @@ union uvxh_event_occurred2_u { unsigned long rtc_30:1; /* RW */ unsigned long rtc_31:1; /* RW */ unsigned long rsvd_32_63:32; - } sx; + } s2; + struct uv3h_event_occurred2_s { + unsigned long rtc_0:1; /* RW */ + unsigned long rtc_1:1; /* RW */ + unsigned long rtc_2:1; /* RW */ + unsigned long rtc_3:1; /* RW */ + unsigned long rtc_4:1; /* RW */ + unsigned long rtc_5:1; /* RW */ + unsigned long rtc_6:1; /* RW */ + unsigned long rtc_7:1; /* RW */ + unsigned long rtc_8:1; /* RW */ + unsigned long rtc_9:1; /* RW */ + unsigned long rtc_10:1; /* RW */ + unsigned long rtc_11:1; /* RW */ + unsigned long rtc_12:1; /* RW */ + unsigned long rtc_13:1; /* RW */ + unsigned long rtc_14:1; /* RW */ + unsigned long rtc_15:1; /* RW */ + unsigned long rtc_16:1; /* RW */ + unsigned long rtc_17:1; /* RW */ + unsigned long rtc_18:1; /* RW */ + unsigned long rtc_19:1; /* RW */ + unsigned long rtc_20:1; /* RW */ + unsigned long rtc_21:1; /* RW */ + unsigned long rtc_22:1; /* RW */ + unsigned long rtc_23:1; /* RW */ + unsigned long rtc_24:1; /* RW */ + unsigned long rtc_25:1; /* RW */ + unsigned long rtc_26:1; /* RW */ + unsigned long rtc_27:1; /* RW */ + unsigned long rtc_28:1; /* RW */ + unsigned long rtc_29:1; /* RW */ + unsigned long rtc_30:1; /* RW */ + unsigned long rtc_31:1; /* RW */ + unsigned long rsvd_32_63:32; + } s3; + struct uv4h_event_occurred2_s { + unsigned long message_accelerator_int0:1; /* RW */ + unsigned long message_accelerator_int1:1; /* RW */ + unsigned long message_accelerator_int2:1; /* RW */ + unsigned long message_accelerator_int3:1; /* RW */ + unsigned long message_accelerator_int4:1; /* RW */ + unsigned long message_accelerator_int5:1; /* RW */ + unsigned long message_accelerator_int6:1; /* RW */ + unsigned long message_accelerator_int7:1; /* RW */ + unsigned long message_accelerator_int8:1; /* RW */ + unsigned long message_accelerator_int9:1; /* RW */ + unsigned long message_accelerator_int10:1; /* RW */ + unsigned long message_accelerator_int11:1; /* RW */ + unsigned long message_accelerator_int12:1; /* RW */ + unsigned long message_accelerator_int13:1; /* RW */ + unsigned long message_accelerator_int14:1; /* RW */ + unsigned long message_accelerator_int15:1; /* RW */ + unsigned long rtc_interval_int:1; /* RW */ + unsigned long bau_dashboard_int:1; /* RW */ + unsigned long rtc_0:1; /* RW */ + unsigned long rtc_1:1; /* RW */ + unsigned long rtc_2:1; /* RW */ + unsigned long rtc_3:1; /* RW */ + unsigned long rtc_4:1; /* RW */ + unsigned long rtc_5:1; /* RW */ + unsigned long rtc_6:1; /* RW */ + unsigned long rtc_7:1; /* RW */ + unsigned long rtc_8:1; /* RW */ + unsigned long rtc_9:1; /* RW */ + unsigned long rtc_10:1; /* RW */ + unsigned long rtc_11:1; /* RW */ + unsigned long rtc_12:1; /* RW */ + unsigned long rtc_13:1; /* RW */ + unsigned long rtc_14:1; /* RW */ + unsigned long rtc_15:1; /* RW */ + unsigned long rtc_16:1; /* RW */ + unsigned long rtc_17:1; /* RW */ + unsigned long rtc_18:1; /* RW */ + unsigned long rtc_19:1; /* RW */ + unsigned long rtc_20:1; /* RW */ + unsigned long rtc_21:1; /* RW */ + unsigned long rtc_22:1; /* RW */ + unsigned long rtc_23:1; /* RW */ + unsigned long rtc_24:1; /* RW */ + unsigned long rtc_25:1; /* RW */ + unsigned long rtc_26:1; 
/* RW */ + unsigned long rtc_27:1; /* RW */ + unsigned long rtc_28:1; /* RW */ + unsigned long rtc_29:1; /* RW */ + unsigned long rtc_30:1; /* RW */ + unsigned long rtc_31:1; /* RW */ + unsigned long rsvd_50_63:14; + } s4; }; /* ========================================================================= */ /* UVXH_EVENT_OCCURRED2_ALIAS */ /* ========================================================================= */ #define UVXH_EVENT_OCCURRED2_ALIAS 0x70108UL -#define UVXH_EVENT_OCCURRED2_ALIAS_32 0xb70 + +#define UV2H_EVENT_OCCURRED2_ALIAS_32 0xb70 +#define UV3H_EVENT_OCCURRED2_ALIAS_32 0xb70 +#define UV4H_EVENT_OCCURRED2_ALIAS_32 0x610 +#define UVH_EVENT_OCCURRED2_ALIAS_32 ( \ + is_uv2_hub() ? UV2H_EVENT_OCCURRED2_ALIAS_32 : \ + is_uv3_hub() ? UV3H_EVENT_OCCURRED2_ALIAS_32 : \ + /*is_uv4_hub*/ UV4H_EVENT_OCCURRED2_ALIAS_32) /* ========================================================================= */ /* UVXH_LB_BAU_SB_ACTIVATION_STATUS_2 */ /* ========================================================================= */ -#define UVXH_LB_BAU_SB_ACTIVATION_STATUS_2 0x320130UL #define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2 0x320130UL #define UV3H_LB_BAU_SB_ACTIVATION_STATUS_2 0x320130UL -#define UVXH_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x9f0 -#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x320130UL -#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x320130UL +#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_2 0xc8130UL +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_2 ( \ + is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_STATUS_2 : \ + is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_STATUS_2 : \ + /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_STATUS_2) + +#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x9f0 +#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x9f0 +#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0xa10 +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_2_32 ( \ + is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_32 : \ + is_uv3_hub() ? 
UV3H_LB_BAU_SB_ACTIVATION_STATUS_2_32 : \ + /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_STATUS_2_32) #define UVXH_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_SHFT 0 #define UVXH_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_MASK 0xffffffffffffffffUL @@ -2772,6 +4080,10 @@ union uvxh_event_occurred2_u { #define UV3H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_SHFT 0 #define UV3H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_MASK 0xffffffffffffffffUL +#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_SHFT 0 +#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_MASK 0xffffffffffffffffUL + + union uvxh_lb_bau_sb_activation_status_2_u { unsigned long v; struct uvxh_lb_bau_sb_activation_status_2_s { @@ -2783,6 +4095,9 @@ union uvxh_lb_bau_sb_activation_status_2_u { struct uv3h_lb_bau_sb_activation_status_2_s { unsigned long aux_error:64; /* RW */ } s3; + struct uv4h_lb_bau_sb_activation_status_2_s { + unsigned long aux_error:64; /* RW */ + } s4; }; /* ========================================================================= */ @@ -2823,26 +4138,6 @@ union uv3h_gr0_gam_gr_config_u { }; /* ========================================================================= */ -/* UV3H_GR1_GAM_GR_CONFIG */ -/* ========================================================================= */ -#define UV3H_GR1_GAM_GR_CONFIG 0x1000028UL - -#define UV3H_GR1_GAM_GR_CONFIG_M_SKT_SHFT 0 -#define UV3H_GR1_GAM_GR_CONFIG_SUBSPACE_SHFT 10 -#define UV3H_GR1_GAM_GR_CONFIG_M_SKT_MASK 0x000000000000003fUL -#define UV3H_GR1_GAM_GR_CONFIG_SUBSPACE_MASK 0x0000000000000400UL - -union uv3h_gr1_gam_gr_config_u { - unsigned long v; - struct uv3h_gr1_gam_gr_config_s { - unsigned long m_skt:6; /* RW */ - unsigned long undef_6_9:4; /* Undefined */ - unsigned long subspace:1; /* RW */ - unsigned long reserved:53; - } s3; -}; - -/* ========================================================================= */ /* UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR */ /* ========================================================================= */ #define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR 0x1603000UL @@ -2924,5 +4219,67 @@ union uv3h_rh_gam_mmioh_redirect_config1_mmr_u { } s3; }; +/* ========================================================================= */ +/* UV4H_LB_PROC_INTD_QUEUE_FIRST */ +/* ========================================================================= */ +#define UV4H_LB_PROC_INTD_QUEUE_FIRST 0xa4100UL + +#define UV4H_LB_PROC_INTD_QUEUE_FIRST_FIRST_PAYLOAD_ADDRESS_SHFT 6 +#define UV4H_LB_PROC_INTD_QUEUE_FIRST_FIRST_PAYLOAD_ADDRESS_MASK 0x00003fffffffffc0UL + +union uv4h_lb_proc_intd_queue_first_u { + unsigned long v; + struct uv4h_lb_proc_intd_queue_first_s { + unsigned long undef_0_5:6; /* Undefined */ + unsigned long first_payload_address:40; /* RW */ + } s4; +}; + +/* ========================================================================= */ +/* UV4H_LB_PROC_INTD_QUEUE_LAST */ +/* ========================================================================= */ +#define UV4H_LB_PROC_INTD_QUEUE_LAST 0xa4108UL + +#define UV4H_LB_PROC_INTD_QUEUE_LAST_LAST_PAYLOAD_ADDRESS_SHFT 5 +#define UV4H_LB_PROC_INTD_QUEUE_LAST_LAST_PAYLOAD_ADDRESS_MASK 0x00003fffffffffe0UL + +union uv4h_lb_proc_intd_queue_last_u { + unsigned long v; + struct uv4h_lb_proc_intd_queue_last_s { + unsigned long undef_0_4:5; /* Undefined */ + unsigned long last_payload_address:41; /* RW */ + } s4; +}; + +/* ========================================================================= */ +/* UV4H_LB_PROC_INTD_SOFT_ACK_CLEAR */ +/* 
========================================================================= */ +#define UV4H_LB_PROC_INTD_SOFT_ACK_CLEAR 0xa4118UL + +#define UV4H_LB_PROC_INTD_SOFT_ACK_CLEAR_SOFT_ACK_PENDING_FLAGS_SHFT 0 +#define UV4H_LB_PROC_INTD_SOFT_ACK_CLEAR_SOFT_ACK_PENDING_FLAGS_MASK 0x00000000000000ffUL + +union uv4h_lb_proc_intd_soft_ack_clear_u { + unsigned long v; + struct uv4h_lb_proc_intd_soft_ack_clear_s { + unsigned long soft_ack_pending_flags:8; /* WP */ + } s4; +}; + +/* ========================================================================= */ +/* UV4H_LB_PROC_INTD_SOFT_ACK_PENDING */ +/* ========================================================================= */ +#define UV4H_LB_PROC_INTD_SOFT_ACK_PENDING 0xa4110UL + +#define UV4H_LB_PROC_INTD_SOFT_ACK_PENDING_SOFT_ACK_FLAGS_SHFT 0 +#define UV4H_LB_PROC_INTD_SOFT_ACK_PENDING_SOFT_ACK_FLAGS_MASK 0x00000000000000ffUL + +union uv4h_lb_proc_intd_soft_ack_pending_u { + unsigned long v; + struct uv4h_lb_proc_intd_soft_ack_pending_s { + unsigned long soft_ack_flags:8; /* RW */ + } s4; +}; + #endif /* _ASM_X86_UV_UV_MMRS_H */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 1ae89a2721d6..4dcdf74dfed8 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -142,6 +142,44 @@ struct x86_cpuinit_ops { struct timespec; /** + * struct x86_legacy_devices - legacy x86 devices + * + * @pnpbios: this platform can have a PNPBIOS. If this is disabled the platform + * is known to never have a PNPBIOS. + * + * These are devices known to require LPC or ISA bus. The definition of legacy + * devices adheres to the ACPI 5.2.9.3 IA-PC Boot Architecture flag + * ACPI_FADT_LEGACY_DEVICES. These devices consist of user visible devices on + * the LPC or ISA bus. User visible devices are devices that have end-user + * accessible connectors (for example, LPT parallel port). Legacy devices on + * the LPC bus consist for example of serial and parallel ports, PS/2 keyboard + * / mouse, and the floppy disk controller. A system that lacks all known + * legacy devices can assume all devices can be detected exclusively via + * standard device enumeration mechanisms including the ACPI namespace. + * + * A system which has does not have ACPI_FADT_LEGACY_DEVICES enabled must not + * have any of the legacy devices enumerated below present. + */ +struct x86_legacy_devices { + int pnpbios; +}; + +/** + * struct x86_legacy_features - legacy x86 features + * + * @rtc: this device has a CMOS real-time clock present + * @ebda_search: it's safe to search for the EBDA signature in the hardware's + * low RAM + * @devices: legacy x86 devices, refer to struct x86_legacy_devices + * documentation for further details. + */ +struct x86_legacy_features { + int rtc; + int ebda_search; + struct x86_legacy_devices devices; +}; + +/** * struct x86_platform_ops - platform specific runtime functions * @calibrate_tsc: calibrate TSC * @get_wallclock: get time from HW clock like RTC etc. @@ -152,6 +190,14 @@ struct timespec; * @save_sched_clock_state: save state for sched_clock() on suspend * @restore_sched_clock_state: restore state for sched_clock() on resume * @apic_post_init: adjust apic if neeeded + * @legacy: legacy features + * @set_legacy_features: override legacy features. Use of this callback + * is highly discouraged. 
You should only need + * this if your hardware platform requires further + * custom fine tuning far beyong what may be + * possible in x86_early_init_platform_quirks() by + * only using the current x86_hardware_subarch + * semantics. */ struct x86_platform_ops { unsigned long (*calibrate_tsc)(void); @@ -165,6 +211,8 @@ struct x86_platform_ops { void (*save_sched_clock_state)(void); void (*restore_sched_clock_state)(void); void (*apic_post_init)(void); + struct x86_legacy_features legacy; + void (*set_legacy_features)(void); }; struct pci_dev; @@ -186,6 +234,8 @@ extern struct x86_cpuinit_ops x86_cpuinit; extern struct x86_platform_ops x86_platform; extern struct x86_msi_ops x86_msi; extern struct x86_io_apic_ops x86_io_apic_ops; + +extern void x86_early_init_platform_quirks(void); extern void x86_init_noop(void); extern void x86_init_uint_noop(unsigned int unused); diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h index c54beb44c4c1..635eac543922 100644 --- a/arch/x86/include/asm/xor_32.h +++ b/arch/x86/include/asm/xor_32.h @@ -550,7 +550,7 @@ static struct xor_block_template xor_block_pIII_sse = { #define XOR_TRY_TEMPLATES \ do { \ AVX_XOR_SPEED; \ - if (cpu_has_xmm) { \ + if (boot_cpu_has(X86_FEATURE_XMM)) { \ xor_speed(&xor_block_pIII_sse); \ xor_speed(&xor_block_sse_pf64); \ } else if (boot_cpu_has(X86_FEATURE_MMX)) { \ diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h index 7c0a517ec751..22a7b1870a31 100644 --- a/arch/x86/include/asm/xor_avx.h +++ b/arch/x86/include/asm/xor_avx.h @@ -167,12 +167,12 @@ static struct xor_block_template xor_block_avx = { #define AVX_XOR_SPEED \ do { \ - if (cpu_has_avx && cpu_has_osxsave) \ + if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE)) \ xor_speed(&xor_block_avx); \ } while (0) #define AVX_SELECT(FASTEST) \ - (cpu_has_avx && cpu_has_osxsave ? &xor_block_avx : FASTEST) + (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE) ? &xor_block_avx : FASTEST) #else diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 329254373479..c18ce67495fa 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -157,7 +157,46 @@ struct boot_params { __u8 _pad9[276]; /* 0xeec */ } __attribute__((packed)); -enum { +/** + * enum x86_hardware_subarch - x86 hardware subarchitecture + * + * The x86 hardware_subarch and hardware_subarch_data were added as of the x86 + * boot protocol 2.07 to help distinguish and support custom x86 boot + * sequences. This enum represents accepted values for the x86 + * hardware_subarch. Custom x86 boot sequences (not X86_SUBARCH_PC) do not + * have or simply *cannot* make use of natural stubs like BIOS or EFI, the + * hardware_subarch can be used on the Linux entry path to revector to a + * subarchitecture stub when needed. This subarchitecture stub can be used to + * set up Linux boot parameters or for special care to account for nonstandard + * handling of page tables. + * + * These enums should only ever be used by x86 code, and the code that uses + * it should be well contained and compartamentalized. + * + * KVM and Xen HVM do not have a subarch as these are expected to follow + * standard x86 boot entries. If there is a genuine need for "hypervisor" type + * that should be considered separately in the future. Future guest types + * should seriously consider working with standard x86 boot stubs such as + * the BIOS or EFI boot stubs. 
+ * + * WARNING: this enum is only used for legacy hacks, for platform features that + * are not easily enumerated or discoverable. You should not ever use + * this for new features. + * + * @X86_SUBARCH_PC: Should be used if the hardware is enumerable using standard + * PC mechanisms (PCI, ACPI) and doesn't need a special boot flow. + * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest + * @X86_SUBARCH_XEN: Used for Xen guest types which follow the PV boot path, + * which start at asm startup_xen() entry point and later jump to the C + * xen_start_kernel() entry point. Both domU and dom0 type of guests are + * currently supportd through this PV boot path. + * @X86_SUBARCH_INTEL_MID: Used for Intel MID (Mobile Internet Device) platform + * systems which do not have the PCI legacy interfaces. + * @X86_SUBARCH_CE4100: Used for Intel CE media processor (CE4100) SoC for + * for settop boxes and media devices, the use of a subarch for CE4100 + * is more of a hack... + */ +enum x86_hardware_subarch { X86_SUBARCH_PC = 0, X86_SUBARCH_LGUEST, X86_SUBARCH_XEN, diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 616ebd22ef9a..9abf8551c7e4 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -2,7 +2,11 @@ # Makefile for the linux kernel. # -extra-y := head_$(BITS).o head$(BITS).o head.o vmlinux.lds +extra-y := head_$(BITS).o +extra-y += head$(BITS).o +extra-y += ebda.o +extra-y += platform-quirks.o +extra-y += vmlinux.lds CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 8c2f1ef6ca23..f115a58f7c84 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -136,7 +136,7 @@ static int __init acpi_parse_madt(struct acpi_table_header *table) { struct acpi_table_madt *madt = NULL; - if (!cpu_has_apic) + if (!boot_cpu_has(X86_FEATURE_APIC)) return -EINVAL; madt = (struct acpi_table_madt *)table; @@ -913,6 +913,15 @@ late_initcall(hpet_insert_resource); static int __init acpi_parse_fadt(struct acpi_table_header *table) { + if (!(acpi_gbl_FADT.boot_flags & ACPI_FADT_LEGACY_DEVICES)) { + pr_debug("ACPI: no legacy devices present\n"); + x86_platform.legacy.devices.pnpbios = 0; + } + + if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) { + pr_debug("ACPI: not registering RTC platform device\n"); + x86_platform.legacy.rtc = 0; + } #ifdef CONFIG_X86_PM_TIMER /* detect the location of the ACPI PM Timer */ @@ -951,7 +960,7 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void) { int count; - if (!cpu_has_apic) + if (!boot_cpu_has(X86_FEATURE_APIC)) return -ENODEV; /* @@ -979,7 +988,7 @@ static int __init acpi_parse_madt_lapic_entries(void) int ret; struct acpi_subtable_proc madt_proc[2]; - if (!cpu_has_apic) + if (!boot_cpu_has(X86_FEATURE_APIC)) return -ENODEV; /* @@ -1125,7 +1134,7 @@ static int __init acpi_parse_madt_ioapic_entries(void) if (acpi_disabled || acpi_noirq) return -ENODEV; - if (!cpu_has_apic) + if (!boot_cpu_has(X86_FEATURE_APIC)) return -ENODEV; /* diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 25f909362b7a..5cb272a7a5a3 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -11,6 +11,7 @@ #include <linux/stop_machine.h> #include <linux/slab.h> #include <linux/kdebug.h> +#include <asm/text-patching.h> #include <asm/alternative.h> #include <asm/sections.h> #include <asm/pgtable.h> diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index d356987a04e9..60078a67d7e3 100644 --- 
a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -607,7 +607,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev) long tapic = apic_read(APIC_TMCCT); unsigned long pm = acpi_pm_read_early(); - if (cpu_has_tsc) + if (boot_cpu_has(X86_FEATURE_TSC)) tsc = rdtsc(); switch (lapic_cal_loops++) { @@ -668,7 +668,7 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc) *delta = (long)res; /* Correct the tsc counter value */ - if (cpu_has_tsc) { + if (boot_cpu_has(X86_FEATURE_TSC)) { res = (((u64)(*deltatsc)) * pm_100ms); do_div(res, deltapm); apic_printk(APIC_VERBOSE, "TSC delta adjusted to " @@ -760,7 +760,7 @@ static int __init calibrate_APIC_clock(void) apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", lapic_timer_frequency); - if (cpu_has_tsc) { + if (boot_cpu_has(X86_FEATURE_TSC)) { apic_printk(APIC_VERBOSE, "..... CPU clock speed is " "%ld.%04ld MHz.\n", (deltatsc / LAPIC_CAL_LOOPS) / (1000000 / HZ), @@ -1085,7 +1085,7 @@ void lapic_shutdown(void) { unsigned long flags; - if (!cpu_has_apic && !apic_from_smp_config()) + if (!boot_cpu_has(X86_FEATURE_APIC) && !apic_from_smp_config()) return; local_irq_save(flags); @@ -1134,7 +1134,7 @@ void __init init_bsp_APIC(void) * Don't do the setup now if we have a SMP BIOS as the * through-I/O-APIC virtual wire mode might be active. */ - if (smp_found_config || !cpu_has_apic) + if (smp_found_config || !boot_cpu_has(X86_FEATURE_APIC)) return; /* @@ -1227,7 +1227,7 @@ void setup_local_APIC(void) unsigned long long tsc = 0, ntsc; long long max_loops = cpu_khz ? cpu_khz : 1000000; - if (cpu_has_tsc) + if (boot_cpu_has(X86_FEATURE_TSC)) tsc = rdtsc(); if (disable_apic) { @@ -1311,7 +1311,7 @@ void setup_local_APIC(void) break; } if (queued) { - if (cpu_has_tsc && cpu_khz) { + if (boot_cpu_has(X86_FEATURE_TSC) && cpu_khz) { ntsc = rdtsc(); max_loops = (cpu_khz << 10) - (ntsc - tsc); } else @@ -1445,7 +1445,7 @@ static void __x2apic_disable(void) { u64 msr; - if (!cpu_has_apic) + if (!boot_cpu_has(X86_FEATURE_APIC)) return; rdmsrl(MSR_IA32_APICBASE, msr); @@ -1561,7 +1561,7 @@ void __init check_x2apic(void) pr_info("x2apic: enabled by BIOS, switching to x2apic ops\n"); x2apic_mode = 1; x2apic_state = X2APIC_ON; - } else if (!cpu_has_x2apic) { + } else if (!boot_cpu_has(X86_FEATURE_X2APIC)) { x2apic_state = X2APIC_DISABLED; } } @@ -1632,7 +1632,7 @@ void __init enable_IR_x2apic(void) */ static int __init detect_init_APIC(void) { - if (!cpu_has_apic) { + if (!boot_cpu_has(X86_FEATURE_APIC)) { pr_info("No local APIC present\n"); return -1; } @@ -1711,14 +1711,14 @@ static int __init detect_init_APIC(void) goto no_apic; case X86_VENDOR_INTEL: if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 || - (boot_cpu_data.x86 == 5 && cpu_has_apic)) + (boot_cpu_data.x86 == 5 && boot_cpu_has(X86_FEATURE_APIC))) break; goto no_apic; default: goto no_apic; } - if (!cpu_has_apic) { + if (!boot_cpu_has(X86_FEATURE_APIC)) { /* * Over-ride BIOS and try to enable the local APIC only if * "lapic" specified. @@ -2233,19 +2233,19 @@ int __init APIC_init_uniprocessor(void) return -1; } #ifdef CONFIG_X86_64 - if (!cpu_has_apic) { + if (!boot_cpu_has(X86_FEATURE_APIC)) { disable_apic = 1; pr_info("Apic disabled by BIOS\n"); return -1; } #else - if (!smp_found_config && !cpu_has_apic) + if (!smp_found_config && !boot_cpu_has(X86_FEATURE_APIC)) return -1; /* * Complain if the BIOS pretends there is one. 
*/ - if (!cpu_has_apic && + if (!boot_cpu_has(X86_FEATURE_APIC) && APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { pr_err("BIOS bug, local APIC 0x%x not detected!...\n", boot_cpu_physical_apicid); @@ -2426,7 +2426,7 @@ static void apic_pm_activate(void) static int __init init_lapic_sysfs(void) { /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ - if (cpu_has_apic) + if (boot_cpu_has(X86_FEATURE_APIC)) register_syscore_ops(&lapic_syscore_ops); return 0; diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 331a7a07c48f..13d19ed58514 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -100,13 +100,13 @@ static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask, static u32 noop_apic_read(u32 reg) { - WARN_ON_ONCE((cpu_has_apic && !disable_apic)); + WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic); return 0; } static void noop_apic_write(u32 reg, u32 v) { - WARN_ON_ONCE(cpu_has_apic && !disable_apic); + WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic); } struct apic apic_noop = { diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index fdb0fbfb1197..84e33ff5a6d5 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1454,7 +1454,7 @@ void native_disable_io_apic(void) ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); } - if (cpu_has_apic || apic_from_smp_config()) + if (boot_cpu_has(X86_FEATURE_APIC) || apic_from_smp_config()) disconnect_bsp_APIC(ioapic_i8259.pin != -1); } diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 28bde88b0085..2a0f225afebd 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -230,7 +230,7 @@ int safe_smp_processor_id(void) { int apicid, cpuid; - if (!cpu_has_apic) + if (!boot_cpu_has(X86_FEATURE_APIC)) return 0; apicid = hard_smp_processor_id(); diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index ef495511f019..a5e400afc563 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -944,7 +944,7 @@ static int __init print_ICs(void) print_PIC(); /* don't print out if apic is not there */ - if (!cpu_has_apic && !apic_from_smp_config()) + if (!boot_cpu_has(X86_FEATURE_APIC) && !apic_from_smp_config()) return 0; print_local_APICs(show_lapic); diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index d7ce96a7daca..29003154fafd 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -48,12 +48,35 @@ static u64 gru_start_paddr, gru_end_paddr; static u64 gru_dist_base, gru_first_node_paddr = -1LL, gru_last_node_paddr; static u64 gru_dist_lmask, gru_dist_umask; static union uvh_apicid uvh_apicid; + +/* info derived from CPUID */ +static struct { + unsigned int apicid_shift; + unsigned int apicid_mask; + unsigned int socketid_shift; /* aka pnode_shift for UV1/2/3 */ + unsigned int pnode_mask; + unsigned int gpa_shift; +} uv_cpuid; + int uv_min_hub_revision_id; EXPORT_SYMBOL_GPL(uv_min_hub_revision_id); unsigned int uv_apicid_hibits; EXPORT_SYMBOL_GPL(uv_apicid_hibits); static struct apic apic_x2apic_uv_x; +static struct uv_hub_info_s uv_hub_info_node0; + +/* Set this to use hardware error handler instead of kernel panic */ +static int disable_uv_undefined_panic = 1; +unsigned long uv_undefined(char *str) +{ + if (likely(!disable_uv_undefined_panic)) + panic("UV: error: undefined MMR: %s\n", str); + else + 
pr_crit("UV: error: undefined MMR: %s\n", str); + return ~0ul; /* cause a machine fault */ +} +EXPORT_SYMBOL(uv_undefined); static unsigned long __init uv_early_read_mmr(unsigned long addr) { @@ -108,21 +131,71 @@ static int __init early_get_pnodeid(void) case UV3_HUB_PART_NUMBER_X: uv_min_hub_revision_id += UV3_HUB_REVISION_BASE; break; + case UV4_HUB_PART_NUMBER: + uv_min_hub_revision_id += UV4_HUB_REVISION_BASE - 1; + break; } uv_hub_info->hub_revision = uv_min_hub_revision_id; - pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1); + uv_cpuid.pnode_mask = (1 << m_n_config.s.n_skt) - 1; + pnode = (node_id.s.node_id >> 1) & uv_cpuid.pnode_mask; + uv_cpuid.gpa_shift = 46; /* default unless changed */ + + pr_info("UV: rev:%d part#:%x nodeid:%04x n_skt:%d pnmsk:%x pn:%x\n", + node_id.s.revision, node_id.s.part_number, node_id.s.node_id, + m_n_config.s.n_skt, uv_cpuid.pnode_mask, pnode); return pnode; } -static void __init early_get_apic_pnode_shift(void) +/* [copied from arch/x86/kernel/cpu/topology.c:detect_extended_topology()] */ +#define SMT_LEVEL 0 /* leaf 0xb SMT level */ +#define INVALID_TYPE 0 /* leaf 0xb sub-leaf types */ +#define SMT_TYPE 1 +#define CORE_TYPE 2 +#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff) +#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) + +static void set_x2apic_bits(void) +{ + unsigned int eax, ebx, ecx, edx, sub_index; + unsigned int sid_shift; + + cpuid(0, &eax, &ebx, &ecx, &edx); + if (eax < 0xb) { + pr_info("UV: CPU does not have CPUID.11\n"); + return; + } + cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx); + if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) { + pr_info("UV: CPUID.11 not implemented\n"); + return; + } + sid_shift = BITS_SHIFT_NEXT_LEVEL(eax); + sub_index = 1; + do { + cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx); + if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) { + sid_shift = BITS_SHIFT_NEXT_LEVEL(eax); + break; + } + sub_index++; + } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE); + uv_cpuid.apicid_shift = 0; + uv_cpuid.apicid_mask = (~(-1 << sid_shift)); + uv_cpuid.socketid_shift = sid_shift; +} + +static void __init early_get_apic_socketid_shift(void) { - uvh_apicid.v = uv_early_read_mmr(UVH_APICID); - if (!uvh_apicid.v) - /* - * Old bios, use default value - */ - uvh_apicid.s.pnode_shift = UV_APIC_PNODE_SHIFT; + if (is_uv2_hub() || is_uv3_hub()) + uvh_apicid.v = uv_early_read_mmr(UVH_APICID); + + set_x2apic_bits(); + + pr_info("UV: apicid_shift:%d apicid_mask:0x%x\n", + uv_cpuid.apicid_shift, uv_cpuid.apicid_mask); + pr_info("UV: socketid_shift:%d pnode_mask:0x%x\n", + uv_cpuid.socketid_shift, uv_cpuid.pnode_mask); } /* @@ -150,13 +223,18 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) if (strncmp(oem_id, "SGI", 3) != 0) return 0; + /* Setup early hub type field in uv_hub_info for Node 0 */ + uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0; + /* * Determine UV arch type. * SGI: UV100/1000 * SGI2: UV2000/3000 * SGI3: UV300 (truncated to 4 chars because of different varieties) + * SGI4: UV400 (truncated to 4 chars because of different varieties) */ uv_hub_info->hub_revision = + !strncmp(oem_id, "SGI4", 4) ? UV4_HUB_REVISION_BASE : !strncmp(oem_id, "SGI3", 4) ? UV3_HUB_REVISION_BASE : !strcmp(oem_id, "SGI2") ? UV2_HUB_REVISION_BASE : !strcmp(oem_id, "SGI") ? 
UV1_HUB_REVISION_BASE : 0; @@ -165,7 +243,7 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) goto badbios; pnodeid = early_get_pnodeid(); - early_get_apic_pnode_shift(); + early_get_apic_socketid_shift(); x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; x86_platform.nmi_init = uv_nmi_init; @@ -211,17 +289,11 @@ int is_uv_system(void) } EXPORT_SYMBOL_GPL(is_uv_system); -DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); -EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info); - -struct uv_blade_info *uv_blade_info; -EXPORT_SYMBOL_GPL(uv_blade_info); - -short *uv_node_to_blade; -EXPORT_SYMBOL_GPL(uv_node_to_blade); +void **__uv_hub_info_list; +EXPORT_SYMBOL_GPL(__uv_hub_info_list); -short *uv_cpu_to_blade; -EXPORT_SYMBOL_GPL(uv_cpu_to_blade); +DEFINE_PER_CPU(struct uv_cpu_info_s, __uv_cpu_info); +EXPORT_PER_CPU_SYMBOL_GPL(__uv_cpu_info); short uv_possible_blades; EXPORT_SYMBOL_GPL(uv_possible_blades); @@ -229,6 +301,115 @@ EXPORT_SYMBOL_GPL(uv_possible_blades); unsigned long sn_rtc_cycles_per_second; EXPORT_SYMBOL(sn_rtc_cycles_per_second); +/* the following values are used for the per node hub info struct */ +static __initdata unsigned short *_node_to_pnode; +static __initdata unsigned short _min_socket, _max_socket; +static __initdata unsigned short _min_pnode, _max_pnode, _gr_table_len; +static __initdata struct uv_gam_range_entry *uv_gre_table; +static __initdata struct uv_gam_parameters *uv_gp_table; +static __initdata unsigned short *_socket_to_node; +static __initdata unsigned short *_socket_to_pnode; +static __initdata unsigned short *_pnode_to_socket; +static __initdata struct uv_gam_range_s *_gr_table; +#define SOCK_EMPTY ((unsigned short)~0) + +extern int uv_hub_info_version(void) +{ + return UV_HUB_INFO_VERSION; +} +EXPORT_SYMBOL(uv_hub_info_version); + +/* Build GAM range lookup table */ +static __init void build_uv_gr_table(void) +{ + struct uv_gam_range_entry *gre = uv_gre_table; + struct uv_gam_range_s *grt; + unsigned long last_limit = 0, ram_limit = 0; + int bytes, i, sid, lsid = -1; + + if (!gre) + return; + + bytes = _gr_table_len * sizeof(struct uv_gam_range_s); + grt = kzalloc(bytes, GFP_KERNEL); + BUG_ON(!grt); + _gr_table = grt; + + for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { + if (gre->type == UV_GAM_RANGE_TYPE_HOLE) { + if (!ram_limit) { /* mark hole between ram/non-ram */ + ram_limit = last_limit; + last_limit = gre->limit; + lsid++; + continue; + } + last_limit = gre->limit; + pr_info("UV: extra hole in GAM RE table @%d\n", + (int)(gre - uv_gre_table)); + continue; + } + if (_max_socket < gre->sockid) { + pr_err("UV: GAM table sockid(%d) too large(>%d) @%d\n", + gre->sockid, _max_socket, + (int)(gre - uv_gre_table)); + continue; + } + sid = gre->sockid - _min_socket; + if (lsid < sid) { /* new range */ + grt = &_gr_table[sid]; + grt->base = lsid; + grt->nasid = gre->nasid; + grt->limit = last_limit = gre->limit; + lsid = sid; + continue; + } + if (lsid == sid && !ram_limit) { /* update range */ + if (grt->limit == last_limit) { /* .. 
if contiguous */ + grt->limit = last_limit = gre->limit; + continue; + } + } + if (!ram_limit) { /* non-contiguous ram range */ + grt++; + grt->base = sid - 1; + grt->nasid = gre->nasid; + grt->limit = last_limit = gre->limit; + continue; + } + grt++; /* non-contiguous/non-ram */ + grt->base = grt - _gr_table; /* base is this entry */ + grt->nasid = gre->nasid; + grt->limit = last_limit = gre->limit; + lsid++; + } + + /* shorten table if possible */ + grt++; + i = grt - _gr_table; + if (i < _gr_table_len) { + void *ret; + + bytes = i * sizeof(struct uv_gam_range_s); + ret = krealloc(_gr_table, bytes, GFP_KERNEL); + if (ret) { + _gr_table = ret; + _gr_table_len = i; + } + } + + /* display resultant gam range table */ + for (i = 0, grt = _gr_table; i < _gr_table_len; i++, grt++) { + int gb = grt->base; + unsigned long start = gb < 0 ? 0 : + (unsigned long)_gr_table[gb].limit << UV_GAM_RANGE_SHFT; + unsigned long end = + (unsigned long)grt->limit << UV_GAM_RANGE_SHFT; + + pr_info("UV: GAM Range %2d %04x 0x%013lx-0x%013lx (%d)\n", + i, grt->nasid, start, end, gb); + } +} + static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) { unsigned long val; @@ -355,7 +536,6 @@ static unsigned long set_apic_id(unsigned int id) static unsigned int uv_read_apic_id(void) { - return x2apic_get_apic_id(apic_read(APIC_ID)); } @@ -430,58 +610,38 @@ static void set_x2apic_extra_bits(int pnode) __this_cpu_write(x2apic_extra_bits, pnode << uvh_apicid.s.pnode_shift); } -/* - * Called on boot cpu. - */ -static __init int boot_pnode_to_blade(int pnode) -{ - int blade; - - for (blade = 0; blade < uv_num_possible_blades(); blade++) - if (pnode == uv_blade_info[blade].pnode) - return blade; - BUG(); -} - -struct redir_addr { - unsigned long redirect; - unsigned long alias; -}; - +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_LENGTH 3 #define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT -static __initdata struct redir_addr redir_addrs[] = { - {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR}, - {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR}, - {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR}, -}; - -static unsigned char get_n_lshift(int m_val) -{ - union uv3h_gr0_gam_gr_config_u m_gr_config; - - if (is_uv1_hub()) - return m_val; - - if (is_uv2_hub()) - return m_val == 40 ? 
40 : 39; - - m_gr_config.v = uv_read_local_mmr(UV3H_GR0_GAM_GR_CONFIG); - return m_gr_config.s3.m_skt; -} - static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) { union uvh_rh_gam_alias210_overlay_config_2_mmr_u alias; union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect; + unsigned long m_redirect; + unsigned long m_overlay; int i; - for (i = 0; i < ARRAY_SIZE(redir_addrs); i++) { - alias.v = uv_read_local_mmr(redir_addrs[i].alias); + for (i = 0; i < UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_LENGTH; i++) { + switch (i) { + case 0: + m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR; + m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR; + break; + case 1: + m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR; + m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR; + break; + case 2: + m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR; + m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR; + break; + } + alias.v = uv_read_local_mmr(m_overlay); if (alias.s.enable && alias.s.base == 0) { *size = (1UL << alias.s.m_alias); - redirect.v = uv_read_local_mmr(redir_addrs[i].redirect); - *base = (unsigned long)redirect.s.dest_base << DEST_SHIFT; + redirect.v = uv_read_local_mmr(m_redirect); + *base = (unsigned long)redirect.s.dest_base + << DEST_SHIFT; return; } } @@ -544,6 +704,8 @@ static __init void map_gru_high(int max_pnode) { union uvh_rh_gam_gru_overlay_config_mmr_u gru; int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT; + unsigned long mask = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK; + unsigned long base; gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); if (!gru.s.enable) { @@ -555,8 +717,9 @@ static __init void map_gru_high(int max_pnode) map_gru_distributed(gru.v); return; } - map_high("GRU", gru.s.base, shift, shift, max_pnode, map_wb); - gru_start_paddr = ((u64)gru.s.base << shift); + base = (gru.v & mask) >> shift; + map_high("GRU", base, shift, shift, max_pnode, map_wb); + gru_start_paddr = ((u64)base << shift); gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1); } @@ -595,6 +758,7 @@ static __initdata struct mmioh_config mmiohs[] = { }, }; +/* UV3 & UV4 have identical MMIOH overlay configs */ static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode) { union uv3h_rh_gam_mmioh_overlay_config0_mmr_u overlay; @@ -674,7 +838,7 @@ static __init void map_mmioh_high(int min_pnode, int max_pnode) unsigned long mmr, base; int shift, enable, m_io, n_io; - if (is_uv3_hub()) { + if (is_uv3_hub() || is_uv4_hub()) { /* Map both MMIOH Regions */ map_mmioh_high_uv3(0, min_pnode, max_pnode); map_mmioh_high_uv3(1, min_pnode, max_pnode); @@ -739,8 +903,8 @@ static __init void uv_rtc_init(void) */ static void uv_heartbeat(unsigned long ignored) { - struct timer_list *timer = &uv_hub_info->scir.timer; - unsigned char bits = uv_hub_info->scir.state; + struct timer_list *timer = &uv_scir_info->timer; + unsigned char bits = uv_scir_info->state; /* flip heartbeat bit */ bits ^= SCIR_CPU_HEARTBEAT; @@ -760,14 +924,14 @@ static void uv_heartbeat(unsigned long ignored) static void uv_heartbeat_enable(int cpu) { - while (!uv_cpu_hub_info(cpu)->scir.enabled) { - struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer; + while (!uv_cpu_scir_info(cpu)->enabled) { + struct timer_list *timer = &uv_cpu_scir_info(cpu)->timer; uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); setup_timer(timer, uv_heartbeat, cpu); timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; add_timer_on(timer, cpu); - 
uv_cpu_hub_info(cpu)->scir.enabled = 1; + uv_cpu_scir_info(cpu)->enabled = 1; /* also ensure that boot cpu is enabled */ cpu = 0; @@ -777,9 +941,9 @@ static void uv_heartbeat_enable(int cpu) #ifdef CONFIG_HOTPLUG_CPU static void uv_heartbeat_disable(int cpu) { - if (uv_cpu_hub_info(cpu)->scir.enabled) { - uv_cpu_hub_info(cpu)->scir.enabled = 0; - del_timer(&uv_cpu_hub_info(cpu)->scir.timer); + if (uv_cpu_scir_info(cpu)->enabled) { + uv_cpu_scir_info(cpu)->enabled = 0; + del_timer(&uv_cpu_scir_info(cpu)->timer); } uv_set_cpu_scir_bits(cpu, 0xff); } @@ -862,155 +1026,475 @@ int uv_set_vga_state(struct pci_dev *pdev, bool decode, void uv_cpu_init(void) { /* CPU 0 initialization will be done via uv_system_init. */ - if (!uv_blade_info) + if (smp_processor_id() == 0) return; - uv_blade_info[uv_numa_blade_id()].nr_online_cpus++; + uv_hub_info->nr_online_cpus++; if (get_uv_system_type() == UV_NON_UNIQUE_APIC) set_x2apic_extra_bits(uv_hub_info->pnode); } -void __init uv_system_init(void) +struct mn { + unsigned char m_val; + unsigned char n_val; + unsigned char m_shift; + unsigned char n_lshift; +}; + +static void get_mn(struct mn *mnp) { - union uvh_rh_gam_config_mmr_u m_n_config; - union uvh_node_id_u node_id; - unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; - int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; - int gnode_extra, min_pnode = 999999, max_pnode = -1; - unsigned long mmr_base, present, paddr; - unsigned short pnode_mask; - unsigned char n_lshift; - char *hub = (is_uv1_hub() ? "UV100/1000" : - (is_uv2_hub() ? "UV2000/3000" : - (is_uv3_hub() ? "UV300" : NULL))); + union uvh_rh_gam_config_mmr_u m_n_config; + union uv3h_gr0_gam_gr_config_u m_gr_config; - if (!hub) { - pr_err("UV: Unknown/unsupported UV hub\n"); - return; + m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR); + mnp->n_val = m_n_config.s.n_skt; + if (is_uv4_hub()) { + mnp->m_val = 0; + mnp->n_lshift = 0; + } else if (is_uv3_hub()) { + mnp->m_val = m_n_config.s3.m_skt; + m_gr_config.v = uv_read_local_mmr(UV3H_GR0_GAM_GR_CONFIG); + mnp->n_lshift = m_gr_config.s3.m_skt; + } else if (is_uv2_hub()) { + mnp->m_val = m_n_config.s2.m_skt; + mnp->n_lshift = mnp->m_val == 40 ? 40 : 39; + } else if (is_uv1_hub()) { + mnp->m_val = m_n_config.s1.m_skt; + mnp->n_lshift = mnp->m_val; } - pr_info("UV: Found %s hub\n", hub); + mnp->m_shift = mnp->m_val ? 64 - mnp->m_val : 0; +} - map_low_mmrs(); +void __init uv_init_hub_info(struct uv_hub_info_s *hub_info) +{ + struct mn mn = {0}; /* avoid unitialized warnings */ + union uvh_node_id_u node_id; - m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR ); - m_val = m_n_config.s.m_skt; - n_val = m_n_config.s.n_skt; - pnode_mask = (1 << n_val) - 1; - n_lshift = get_n_lshift(m_val); - mmr_base = - uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & - ~UV_MMR_ENABLE; + get_mn(&mn); + hub_info->m_val = mn.m_val; + hub_info->n_val = mn.n_val; + hub_info->m_shift = mn.m_shift; + hub_info->n_lshift = mn.n_lshift ? mn.n_lshift : 0; + + hub_info->hub_revision = uv_hub_info->hub_revision; + hub_info->pnode_mask = uv_cpuid.pnode_mask; + hub_info->min_pnode = _min_pnode; + hub_info->min_socket = _min_socket; + hub_info->pnode_to_socket = _pnode_to_socket; + hub_info->socket_to_node = _socket_to_node; + hub_info->socket_to_pnode = _socket_to_pnode; + hub_info->gr_table_len = _gr_table_len; + hub_info->gr_table = _gr_table; + hub_info->gpa_mask = mn.m_val ? 
+ (1UL << (mn.m_val + mn.n_val)) - 1 : + (1UL << uv_cpuid.gpa_shift) - 1; node_id.v = uv_read_local_mmr(UVH_NODE_ID); - gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1; - gnode_upper = ((unsigned long)gnode_extra << m_val); - pr_info("UV: N:%d M:%d pnode_mask:0x%x gnode_upper/extra:0x%lx/0x%x n_lshift 0x%x\n", - n_val, m_val, pnode_mask, gnode_upper, gnode_extra, - n_lshift); + hub_info->gnode_extra = + (node_id.s.node_id & ~((1 << mn.n_val) - 1)) >> 1; + + hub_info->gnode_upper = + ((unsigned long)hub_info->gnode_extra << mn.m_val); + + if (uv_gp_table) { + hub_info->global_mmr_base = uv_gp_table->mmr_base; + hub_info->global_mmr_shift = uv_gp_table->mmr_shift; + hub_info->global_gru_base = uv_gp_table->gru_base; + hub_info->global_gru_shift = uv_gp_table->gru_shift; + hub_info->gpa_shift = uv_gp_table->gpa_shift; + hub_info->gpa_mask = (1UL << hub_info->gpa_shift) - 1; + } else { + hub_info->global_mmr_base = + uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & + ~UV_MMR_ENABLE; + hub_info->global_mmr_shift = _UV_GLOBAL_MMR64_PNODE_SHIFT; + } - pr_info("UV: global MMR base 0x%lx\n", mmr_base); + get_lowmem_redirect( + &hub_info->lowmem_remap_base, &hub_info->lowmem_remap_top); - for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) - uv_possible_blades += - hweight64(uv_read_local_mmr( UVH_NODE_PRESENT_TABLE + i * 8)); + hub_info->apic_pnode_shift = uv_cpuid.socketid_shift; - /* uv_num_possible_blades() is really the hub count */ - pr_info("UV: Found %d blades, %d hubs\n", - is_uv1_hub() ? uv_num_possible_blades() : - (uv_num_possible_blades() + 1) / 2, - uv_num_possible_blades()); + /* show system specific info */ + pr_info("UV: N:%d M:%d m_shift:%d n_lshift:%d\n", + hub_info->n_val, hub_info->m_val, + hub_info->m_shift, hub_info->n_lshift); - bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); - uv_blade_info = kzalloc(bytes, GFP_KERNEL); - BUG_ON(!uv_blade_info); + pr_info("UV: gpa_mask/shift:0x%lx/%d pnode_mask:0x%x apic_pns:%d\n", + hub_info->gpa_mask, hub_info->gpa_shift, + hub_info->pnode_mask, hub_info->apic_pnode_shift); - for (blade = 0; blade < uv_num_possible_blades(); blade++) - uv_blade_info[blade].memory_nid = -1; + pr_info("UV: mmr_base/shift:0x%lx/%ld gru_base/shift:0x%lx/%ld\n", + hub_info->global_mmr_base, hub_info->global_mmr_shift, + hub_info->global_gru_base, hub_info->global_gru_shift); - get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size); + pr_info("UV: gnode_upper:0x%lx gnode_extra:0x%x\n", + hub_info->gnode_upper, hub_info->gnode_extra); +} + +static void __init decode_gam_params(unsigned long ptr) +{ + uv_gp_table = (struct uv_gam_parameters *)ptr; + + pr_info("UV: GAM Params...\n"); + pr_info("UV: mmr_base/shift:0x%llx/%d gru_base/shift:0x%llx/%d gpa_shift:%d\n", + uv_gp_table->mmr_base, uv_gp_table->mmr_shift, + uv_gp_table->gru_base, uv_gp_table->gru_shift, + uv_gp_table->gpa_shift); +} + +static void __init decode_gam_rng_tbl(unsigned long ptr) +{ + struct uv_gam_range_entry *gre = (struct uv_gam_range_entry *)ptr; + unsigned long lgre = 0; + int index = 0; + int sock_min = 999999, pnode_min = 99999; + int sock_max = -1, pnode_max = -1; + + uv_gre_table = gre; + for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { + if (!index) { + pr_info("UV: GAM Range Table...\n"); + pr_info("UV: # %20s %14s %5s %4s %5s %3s %2s %3s\n", + "Range", "", "Size", "Type", "NASID", + "SID", "PN", "PXM"); + } + pr_info( + "UV: %2d: 0x%014lx-0x%014lx %5luG %3d %04x %02x %02x %3d\n", + index++, + (unsigned long)lgre << UV_GAM_RANGE_SHFT, + 
(unsigned long)gre->limit << UV_GAM_RANGE_SHFT, + ((unsigned long)(gre->limit - lgre)) >> + (30 - UV_GAM_RANGE_SHFT), /* 64M -> 1G */ + gre->type, gre->nasid, gre->sockid, + gre->pnode, gre->pxm); + + lgre = gre->limit; + if (sock_min > gre->sockid) + sock_min = gre->sockid; + if (sock_max < gre->sockid) + sock_max = gre->sockid; + if (pnode_min > gre->pnode) + pnode_min = gre->pnode; + if (pnode_max < gre->pnode) + pnode_max = gre->pnode; + } + _min_socket = sock_min; + _max_socket = sock_max; + _min_pnode = pnode_min; + _max_pnode = pnode_max; + _gr_table_len = index; + pr_info( + "UV: GRT: %d entries, sockets(min:%x,max:%x) pnodes(min:%x,max:%x)\n", + index, _min_socket, _max_socket, _min_pnode, _max_pnode); +} + +static void __init decode_uv_systab(void) +{ + struct uv_systab *st; + int i; + + st = uv_systab; + if ((!st || st->revision < UV_SYSTAB_VERSION_UV4) && !is_uv4_hub()) + return; + if (st->revision != UV_SYSTAB_VERSION_UV4_LATEST) { + pr_crit( + "UV: BIOS UVsystab version(%x) mismatch, expecting(%x)\n", + st->revision, UV_SYSTAB_VERSION_UV4_LATEST); + BUG(); + } + + for (i = 0; st->entry[i].type != UV_SYSTAB_TYPE_UNUSED; i++) { + unsigned long ptr = st->entry[i].offset; - bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes(); - uv_node_to_blade = kmalloc(bytes, GFP_KERNEL); - BUG_ON(!uv_node_to_blade); - memset(uv_node_to_blade, 255, bytes); + if (!ptr) + continue; + + ptr = ptr + (unsigned long)st; + + switch (st->entry[i].type) { + case UV_SYSTAB_TYPE_GAM_PARAMS: + decode_gam_params(ptr); + break; + + case UV_SYSTAB_TYPE_GAM_RNG_TBL: + decode_gam_rng_tbl(ptr); + break; + } + } +} - bytes = sizeof(uv_cpu_to_blade[0]) * num_possible_cpus(); - uv_cpu_to_blade = kmalloc(bytes, GFP_KERNEL); - BUG_ON(!uv_cpu_to_blade); - memset(uv_cpu_to_blade, 255, bytes); +/* + * Setup physical blade translations from UVH_NODE_PRESENT_TABLE + * .. NB: UVH_NODE_PRESENT_TABLE is going away, + * .. 
being replaced by GAM Range Table + */ +static __init void boot_init_possible_blades(struct uv_hub_info_s *hub_info) +{ + int i, uv_pb = 0; - blade = 0; + pr_info("UV: NODE_PRESENT_DEPTH = %d\n", UVH_NODE_PRESENT_TABLE_DEPTH); for (i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) { - present = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8); - for (j = 0; j < 64; j++) { - if (!test_bit(j, &present)) - continue; - pnode = (i * 64 + j) & pnode_mask; - uv_blade_info[blade].pnode = pnode; - uv_blade_info[blade].nr_possible_cpus = 0; - uv_blade_info[blade].nr_online_cpus = 0; - spin_lock_init(&uv_blade_info[blade].nmi_lock); - min_pnode = min(pnode, min_pnode); - max_pnode = max(pnode, max_pnode); - blade++; + unsigned long np; + + np = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8); + if (np) + pr_info("UV: NODE_PRESENT(%d) = 0x%016lx\n", i, np); + + uv_pb += hweight64(np); + } + if (uv_possible_blades != uv_pb) + uv_possible_blades = uv_pb; +} + +static void __init build_socket_tables(void) +{ + struct uv_gam_range_entry *gre = uv_gre_table; + int num, nump; + int cpu, i, lnid; + int minsock = _min_socket; + int maxsock = _max_socket; + int minpnode = _min_pnode; + int maxpnode = _max_pnode; + size_t bytes; + + if (!gre) { + if (is_uv1_hub() || is_uv2_hub() || is_uv3_hub()) { + pr_info("UV: No UVsystab socket table, ignoring\n"); + return; /* not required */ } + pr_crit( + "UV: Error: UVsystab address translations not available!\n"); + BUG(); + } + + /* build socket id -> node id, pnode */ + num = maxsock - minsock + 1; + bytes = num * sizeof(_socket_to_node[0]); + _socket_to_node = kmalloc(bytes, GFP_KERNEL); + _socket_to_pnode = kmalloc(bytes, GFP_KERNEL); + + nump = maxpnode - minpnode + 1; + bytes = nump * sizeof(_pnode_to_socket[0]); + _pnode_to_socket = kmalloc(bytes, GFP_KERNEL); + BUG_ON(!_socket_to_node || !_socket_to_pnode || !_pnode_to_socket); + + for (i = 0; i < num; i++) + _socket_to_node[i] = _socket_to_pnode[i] = SOCK_EMPTY; + + for (i = 0; i < nump; i++) + _pnode_to_socket[i] = SOCK_EMPTY; + + /* fill in pnode/node/addr conversion list values */ + pr_info("UV: GAM Building socket/pnode/pxm conversion tables\n"); + for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { + if (gre->type == UV_GAM_RANGE_TYPE_HOLE) + continue; + i = gre->sockid - minsock; + if (_socket_to_pnode[i] != SOCK_EMPTY) + continue; /* duplicate */ + _socket_to_pnode[i] = gre->pnode; + _socket_to_node[i] = gre->pxm; + + i = gre->pnode - minpnode; + _pnode_to_socket[i] = gre->sockid; + + pr_info( + "UV: sid:%02x type:%d nasid:%04x pn:%02x pxm:%2d pn2s:%2x\n", + gre->sockid, gre->type, gre->nasid, + _socket_to_pnode[gre->sockid - minsock], + _socket_to_node[gre->sockid - minsock], + _pnode_to_socket[gre->pnode - minpnode]); } - uv_bios_init(); + /* check socket -> node values */ + lnid = -1; + for_each_present_cpu(cpu) { + int nid = cpu_to_node(cpu); + int apicid, sockid; + + if (lnid == nid) + continue; + lnid = nid; + apicid = per_cpu(x86_cpu_to_apicid, cpu); + sockid = apicid >> uv_cpuid.socketid_shift; + i = sockid - minsock; + + if (nid != _socket_to_node[i]) { + pr_warn( + "UV: %02x: type:%d socket:%02x PXM:%02x != node:%2d\n", + i, sockid, gre->type, _socket_to_node[i], nid); + _socket_to_node[i] = nid; + } + } + + /* Setup physical blade to pnode translation from GAM Range Table */ + bytes = num_possible_nodes() * sizeof(_node_to_pnode[0]); + _node_to_pnode = kmalloc(bytes, GFP_KERNEL); + BUG_ON(!_node_to_pnode); + + for (lnid = 0; lnid < num_possible_nodes(); lnid++) { + unsigned short sockid; + + 
for (sockid = minsock; sockid <= maxsock; sockid++) { + if (lnid == _socket_to_node[sockid - minsock]) { + _node_to_pnode[lnid] = + _socket_to_pnode[sockid - minsock]; + break; + } + } + if (sockid > maxsock) { + pr_err("UV: socket for node %d not found!\n", lnid); + BUG(); + } + } + + /* + * If socket id == pnode or socket id == node for all nodes, + * system runs faster by removing corresponding conversion table. + */ + pr_info("UV: Checking socket->node/pnode for identity maps\n"); + if (minsock == 0) { + for (i = 0; i < num; i++) + if (_socket_to_node[i] == SOCK_EMPTY || + i != _socket_to_node[i]) + break; + if (i >= num) { + kfree(_socket_to_node); + _socket_to_node = NULL; + pr_info("UV: 1:1 socket_to_node table removed\n"); + } + } + if (minsock == minpnode) { + for (i = 0; i < num; i++) + if (_socket_to_pnode[i] != SOCK_EMPTY && + _socket_to_pnode[i] != i + minpnode) + break; + if (i >= num) { + kfree(_socket_to_pnode); + _socket_to_pnode = NULL; + pr_info("UV: 1:1 socket_to_pnode table removed\n"); + } + } +} + +void __init uv_system_init(void) +{ + struct uv_hub_info_s hub_info = {0}; + int bytes, cpu, nodeid; + unsigned short min_pnode = 9999, max_pnode = 0; + char *hub = is_uv4_hub() ? "UV400" : + is_uv3_hub() ? "UV300" : + is_uv2_hub() ? "UV2000/3000" : + is_uv1_hub() ? "UV100/1000" : NULL; + + if (!hub) { + pr_err("UV: Unknown/unsupported UV hub\n"); + return; + } + pr_info("UV: Found %s hub\n", hub); + + map_low_mmrs(); + + uv_bios_init(); /* get uv_systab for decoding */ + decode_uv_systab(); + build_socket_tables(); + build_uv_gr_table(); + uv_init_hub_info(&hub_info); + uv_possible_blades = num_possible_nodes(); + if (!_node_to_pnode) + boot_init_possible_blades(&hub_info); + + /* uv_num_possible_blades() is really the hub count */ + pr_info("UV: Found %d hubs, %d nodes, %d cpus\n", + uv_num_possible_blades(), + num_possible_nodes(), + num_possible_cpus()); + uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id, &sn_region_size, &system_serial_number); + hub_info.coherency_domain_number = sn_coherency_id; uv_rtc_init(); - for_each_present_cpu(cpu) { - int apicid = per_cpu(x86_cpu_to_apicid, cpu); + bytes = sizeof(void *) * uv_num_possible_blades(); + __uv_hub_info_list = kzalloc(bytes, GFP_KERNEL); + BUG_ON(!__uv_hub_info_list); - nid = cpu_to_node(cpu); - /* - * apic_pnode_shift must be set before calling uv_apicid_to_pnode(); - */ - uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; - uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift; - uv_cpu_hub_info(cpu)->hub_revision = uv_hub_info->hub_revision; + bytes = sizeof(struct uv_hub_info_s); + for_each_node(nodeid) { + struct uv_hub_info_s *new_hub; - uv_cpu_hub_info(cpu)->m_shift = 64 - m_val; - uv_cpu_hub_info(cpu)->n_lshift = n_lshift; + if (__uv_hub_info_list[nodeid]) { + pr_err("UV: Node %d UV HUB already initialized!?\n", + nodeid); + BUG(); + } + + /* Allocate new per hub info list */ + new_hub = (nodeid == 0) ? 
+ &uv_hub_info_node0 : + kzalloc_node(bytes, GFP_KERNEL, nodeid); + BUG_ON(!new_hub); + __uv_hub_info_list[nodeid] = new_hub; + new_hub = uv_hub_info_list(nodeid); + BUG_ON(!new_hub); + *new_hub = hub_info; + + /* Use information from GAM table if available */ + if (_node_to_pnode) + new_hub->pnode = _node_to_pnode[nodeid]; + else /* Fill in during cpu loop */ + new_hub->pnode = 0xffff; + new_hub->numa_blade_id = uv_node_to_blade_id(nodeid); + new_hub->memory_nid = -1; + new_hub->nr_possible_cpus = 0; + new_hub->nr_online_cpus = 0; + } + /* Initialize per cpu info */ + for_each_possible_cpu(cpu) { + int apicid = per_cpu(x86_cpu_to_apicid, cpu); + int numa_node_id; + unsigned short pnode; + + nodeid = cpu_to_node(cpu); + numa_node_id = numa_cpu_node(cpu); pnode = uv_apicid_to_pnode(apicid); - blade = boot_pnode_to_blade(pnode); - lcpu = uv_blade_info[blade].nr_possible_cpus; - uv_blade_info[blade].nr_possible_cpus++; - - /* Any node on the blade, else will contain -1. */ - uv_blade_info[blade].memory_nid = nid; - - uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; - uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size; - uv_cpu_hub_info(cpu)->m_val = m_val; - uv_cpu_hub_info(cpu)->n_val = n_val; - uv_cpu_hub_info(cpu)->numa_blade_id = blade; - uv_cpu_hub_info(cpu)->blade_processor_id = lcpu; - uv_cpu_hub_info(cpu)->pnode = pnode; - uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1; - uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; - uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; - uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; - uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; - uv_cpu_hub_info(cpu)->scir.offset = uv_scir_offset(apicid); - uv_node_to_blade[nid] = blade; - uv_cpu_to_blade[cpu] = blade; + + uv_cpu_info_per(cpu)->p_uv_hub_info = uv_hub_info_list(nodeid); + uv_cpu_info_per(cpu)->blade_cpu_id = + uv_cpu_hub_info(cpu)->nr_possible_cpus++; + if (uv_cpu_hub_info(cpu)->memory_nid == -1) + uv_cpu_hub_info(cpu)->memory_nid = cpu_to_node(cpu); + if (nodeid != numa_node_id && /* init memoryless node */ + uv_hub_info_list(numa_node_id)->pnode == 0xffff) + uv_hub_info_list(numa_node_id)->pnode = pnode; + else if (uv_cpu_hub_info(cpu)->pnode == 0xffff) + uv_cpu_hub_info(cpu)->pnode = pnode; + uv_cpu_scir_info(cpu)->offset = uv_scir_offset(apicid); } - /* Add blade/pnode info for nodes without cpus */ - for_each_online_node(nid) { - if (uv_node_to_blade[nid] >= 0) - continue; - paddr = node_start_pfn(nid) << PAGE_SHIFT; - pnode = uv_gpa_to_pnode(uv_soc_phys_ram_to_gpa(paddr)); - blade = boot_pnode_to_blade(pnode); - uv_node_to_blade[nid] = blade; + for_each_node(nodeid) { + unsigned short pnode = uv_hub_info_list(nodeid)->pnode; + + /* Add pnode info for pre-GAM list nodes without cpus */ + if (pnode == 0xffff) { + unsigned long paddr; + + paddr = node_start_pfn(nodeid) << PAGE_SHIFT; + pnode = uv_gpa_to_pnode(uv_soc_phys_ram_to_gpa(paddr)); + uv_hub_info_list(nodeid)->pnode = pnode; + } + min_pnode = min(pnode, min_pnode); + max_pnode = max(pnode, max_pnode); + pr_info("UV: UVHUB node:%2d pn:%02x nrcpus:%d\n", + nodeid, + uv_hub_info_list(nodeid)->pnode, + uv_hub_info_list(nodeid)->nr_possible_cpus); } + pr_info("UV: min_pnode:%02x max_pnode:%02x\n", min_pnode, max_pnode); map_gru_high(max_pnode); map_mmr_high(max_pnode); map_mmioh_high(min_pnode, max_pnode); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 9307f182fe30..c7364bd633e1 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -2267,7 
+2267,7 @@ static int __init apm_init(void) dmi_check_system(apm_dmi_table); - if (apm_info.bios.version == 0 || paravirt_enabled() || machine_is_olpc()) { + if (apm_info.bios.version == 0 || machine_is_olpc()) { printk(KERN_INFO "apm: BIOS not found.\n"); return -ENODEV; } diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 5c042466f274..674134e9f5e5 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -80,6 +80,7 @@ void common(void) { OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); OFFSET(BP_version, boot_params, hdr.version); OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); + OFFSET(BP_init_size, boot_params, hdr.init_size); OFFSET(BP_pref_address, boot_params, hdr.pref_address); OFFSET(BP_code32_start, boot_params, hdr.code32_start); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 7b76eb67a9b3..c343a54bed39 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -565,14 +565,17 @@ static void early_init_amd(struct cpuinfo_x86 *c) * can safely set X86_FEATURE_EXTD_APICID unconditionally for families * after 16h. */ - if (cpu_has_apic && c->x86 > 0x16) { - set_cpu_cap(c, X86_FEATURE_EXTD_APICID); - } else if (cpu_has_apic && c->x86 >= 0xf) { - /* check CPU config space for extended APIC ID */ - unsigned int val; - val = read_pci_config(0, 24, 0, 0x68); - if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18))) + if (boot_cpu_has(X86_FEATURE_APIC)) { + if (c->x86 > 0x16) set_cpu_cap(c, X86_FEATURE_EXTD_APICID); + else if (c->x86 >= 0xf) { + /* check CPU config space for extended APIC ID */ + unsigned int val; + + val = read_pci_config(0, 24, 0, 0x68); + if ((val >> 17 & 0x3) == 0x3) + set_cpu_cap(c, X86_FEATURE_EXTD_APICID); + } } #endif @@ -628,6 +631,7 @@ static void init_amd_k8(struct cpuinfo_x86 *c) */ msr_set_bit(MSR_K7_HWCR, 6); #endif + set_cpu_bug(c, X86_BUG_SWAPGS_FENCE); } static void init_amd_gh(struct cpuinfo_x86 *c) @@ -746,7 +750,7 @@ static void init_amd(struct cpuinfo_x86 *c) if (c->x86 >= 0xf) set_cpu_cap(c, X86_FEATURE_K8); - if (cpu_has_xmm2) { + if (cpu_has(c, X86_FEATURE_XMM2)) { /* MFENCE stops RDTSC speculation */ set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 62ff5255ae16..0fe6953f421c 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -437,7 +437,7 @@ void load_percpu_segment(int cpu) #ifdef CONFIG_X86_32 loadsegment(fs, __KERNEL_PERCPU); #else - loadsegment(gs, 0); + __loadsegment_simple(gs, 0); wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu)); #endif load_stack_canary_segment(); @@ -724,6 +724,13 @@ void get_cpu_cap(struct cpuinfo_x86 *c) } } + if (c->extended_cpuid_level >= 0x80000007) { + cpuid(0x80000007, &eax, &ebx, &ecx, &edx); + + c->x86_capability[CPUID_8000_0007_EBX] = ebx; + c->x86_power = edx; + } + if (c->extended_cpuid_level >= 0x80000008) { cpuid(0x80000008, &eax, &ebx, &ecx, &edx); @@ -736,9 +743,6 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_phys_bits = 36; #endif - if (c->extended_cpuid_level >= 0x80000007) - c->x86_power = cpuid_edx(0x80000007); - if (c->extended_cpuid_level >= 0x8000000a) c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a); @@ -869,30 +873,34 @@ static void detect_nopl(struct cpuinfo_x86 *c) #else set_cpu_cap(c, X86_FEATURE_NOPL); #endif +} +static void detect_null_seg_behavior(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_64 /* - * ESPFIX is a 
strange bug. All real CPUs have it. Paravirt - * systems that run Linux at CPL > 0 may or may not have the - * issue, but, even if they have the issue, there's absolutely - * nothing we can do about it because we can't use the real IRET - * instruction. + * Empirically, writing zero to a segment selector on AMD does + * not clear the base, whereas writing zero to a segment + * selector on Intel does clear the base. Intel's behavior + * allows slightly faster context switches in the common case + * where GS is unused by the prev and next threads. * - * NB: For the time being, only 32-bit kernels support - * X86_BUG_ESPFIX as such. 64-bit kernels directly choose - * whether to apply espfix using paravirt hooks. If any - * non-paravirt system ever shows up that does *not* have the - * ESPFIX issue, we can change this. + * Since neither vendor documents this anywhere that I can see, + * detect it directly instead of hardcoding the choice by + * vendor. + * + * I've designated AMD's behavior as the "bug" because it's + * counterintuitive and less friendly. */ -#ifdef CONFIG_X86_32 -#ifdef CONFIG_PARAVIRT - do { - extern void native_iret(void); - if (pv_cpu_ops.iret == native_iret) - set_cpu_bug(c, X86_BUG_ESPFIX); - } while (0); -#else - set_cpu_bug(c, X86_BUG_ESPFIX); -#endif + + unsigned long old_base, tmp; + rdmsrl(MSR_FS_BASE, old_base); + wrmsrl(MSR_FS_BASE, 1); + loadsegment(fs, 0); + rdmsrl(MSR_FS_BASE, tmp); + if (tmp != 0) + set_cpu_bug(c, X86_BUG_NULL_SEG); + wrmsrl(MSR_FS_BASE, old_base); #endif } @@ -928,6 +936,33 @@ static void generic_identify(struct cpuinfo_x86 *c) get_model_name(c); /* Default name */ detect_nopl(c); + + detect_null_seg_behavior(c); + + /* + * ESPFIX is a strange bug. All real CPUs have it. Paravirt + * systems that run Linux at CPL > 0 may or may not have the + * issue, but, even if they have the issue, there's absolutely + * nothing we can do about it because we can't use the real IRET + * instruction. + * + * NB: For the time being, only 32-bit kernels support + * X86_BUG_ESPFIX as such. 64-bit kernels directly choose + * whether to apply espfix using paravirt hooks. If any + * non-paravirt system ever shows up that does *not* have the + * ESPFIX issue, we can change this. + */ +#ifdef CONFIG_X86_32 +# ifdef CONFIG_PARAVIRT + do { + extern void native_iret(void); + if (pv_cpu_ops.iret == native_iret) + set_cpu_bug(c, X86_BUG_ESPFIX); + } while (0); +# else + set_cpu_bug(c, X86_BUG_ESPFIX); +# endif +#endif } static void x86_init_cache_qos(struct cpuinfo_x86 *c) @@ -1083,12 +1118,12 @@ void enable_sep_cpu(void) struct tss_struct *tss; int cpu; + if (!boot_cpu_has(X86_FEATURE_SEP)) + return; + cpu = get_cpu(); tss = &per_cpu(cpu_tss, cpu); - if (!boot_cpu_has(X86_FEATURE_SEP)) - goto out; - /* * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field -- * see the big comment in struct x86_hw_tss's definition. 
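The get_cpu_cap() hunk above begins caching CPUID leaf 0x80000007: EBX is stored in x86_capability[CPUID_8000_0007_EBX] and EDX in c->x86_power. As a minimal illustration of what that leaf reports, here is a stand-alone user-space sketch (not part of the patch; it assumes only the GCC/Clang <cpuid.h> helper) that reads the same leaf and checks the invariant-TSC bit carried in EDX:

/*
 * Illustrative only -- mirrors the leaf 0x80000007 read that the
 * get_cpu_cap() change above caches for the kernel's own use.
 */
#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* __get_cpuid() returns 0 if the extended leaf is not supported */
	if (!__get_cpuid(0x80000007, &eax, &ebx, &ecx, &edx)) {
		puts("CPUID leaf 0x80000007 not available");
		return 1;
	}

	/* EDX is what the patch stores in c->x86_power; bit 8 = invariant TSC */
	printf("0x80000007: ebx=%#x edx=%#x invariant TSC: %s\n",
	       ebx, edx, (edx & (1u << 8)) ? "yes" : "no");
	return 0;
}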
@@ -1103,7 +1138,6 @@ void enable_sep_cpu(void) wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); -out: put_cpu(); } #endif @@ -1535,7 +1569,7 @@ void cpu_init(void) pr_info("Initializing CPU#%d\n", cpu); if (cpu_feature_enabled(X86_FEATURE_VME) || - cpu_has_tsc || + boot_cpu_has(X86_FEATURE_TSC) || boot_cpu_has(X86_FEATURE_DE)) cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 6adef9cac23e..bd9dcd6b712d 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -333,7 +333,7 @@ static void init_cyrix(struct cpuinfo_x86 *c) switch (dir0_lsn) { case 0xd: /* either a 486SLC or DLC w/o DEVID */ dir0_msn = 0; - p = Cx486_name[(cpu_has_fpu ? 1 : 0)]; + p = Cx486_name[!!boot_cpu_has(X86_FEATURE_FPU)]; break; case 0xe: /* a 486S A step */ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index b47df99dc5d2..6e2ffbebbcdb 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -187,9 +187,9 @@ static void early_init_intel(struct cpuinfo_x86 *c) * the TLB when any changes are made to any of the page table entries. * The operating system must reload CR3 to cause the TLB to be flushed" * - * As a result cpu_has_pge() in arch/x86/include/asm/tlbflush.h should - * be false so that __flush_tlb_all() causes CR3 insted of CR4.PGE - * to be modified + * As a result, boot_cpu_has(X86_FEATURE_PGE) in arch/x86/include/asm/tlbflush.h + * should be false so that __flush_tlb_all() causes CR3 insted of CR4.PGE + * to be modified. */ if (c->x86 == 5 && c->x86_model == 9) { pr_info("Disabling PGE capability bit\n"); @@ -270,7 +270,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c) * The Quark is also family 5, but does not have the same bug. */ clear_cpu_bug(c, X86_BUG_F00F); - if (!paravirt_enabled() && c->x86 == 5 && c->x86_model < 9) { + if (c->x86 == 5 && c->x86_model < 9) { static int f00f_workaround_enabled; set_cpu_bug(c, X86_BUG_F00F); @@ -318,7 +318,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c) * integrated APIC (see 11AP erratum in "Pentium Processor * Specification Update"). */ - if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 && + if (boot_cpu_has(X86_FEATURE_APIC) && (c->x86<<8 | c->x86_model<<4) == 0x520 && (c->x86_mask < 0x6 || c->x86_mask == 0xb)) set_cpu_bug(c, X86_BUG_11AP); @@ -493,7 +493,7 @@ static void init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); } - if (cpu_has_xmm2) + if (cpu_has(c, X86_FEATURE_XMM2)) set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); if (boot_cpu_has(X86_FEATURE_DS)) { @@ -505,7 +505,7 @@ static void init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_PEBS); } - if (c->x86 == 6 && cpu_has_clflush && + if (c->x86 == 6 && boot_cpu_has(X86_FEATURE_CLFLUSH) && (c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47)) set_cpu_bug(c, X86_BUG_CLFLUSH_MONITOR); diff --git a/arch/x86/kernel/cpu/mcheck/mce-genpool.c b/arch/x86/kernel/cpu/mcheck/mce-genpool.c index 2658e2af74ec..93d824ec3120 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-genpool.c +++ b/arch/x86/kernel/cpu/mcheck/mce-genpool.c @@ -26,6 +26,52 @@ static struct gen_pool *mce_evt_pool; static LLIST_HEAD(mce_event_llist); static char gen_pool_buf[MCE_POOLSZ]; +/* + * Compare the record "t" with each of the records on list "l" to see if + * an equivalent one is present in the list. 
+ */ +static bool is_duplicate_mce_record(struct mce_evt_llist *t, struct mce_evt_llist *l) +{ + struct mce_evt_llist *node; + struct mce *m1, *m2; + + m1 = &t->mce; + + llist_for_each_entry(node, &l->llnode, llnode) { + m2 = &node->mce; + + if (!mce_cmp(m1, m2)) + return true; + } + return false; +} + +/* + * The system has panicked - we'd like to peruse the list of MCE records + * that have been queued, but not seen by anyone yet. The list is in + * reverse time order, so we need to reverse it. While doing that we can + * also drop duplicate records (these were logged because some banks are + * shared between cores or by all threads on a socket). + */ +struct llist_node *mce_gen_pool_prepare_records(void) +{ + struct llist_node *head; + LLIST_HEAD(new_head); + struct mce_evt_llist *node, *t; + + head = llist_del_all(&mce_event_llist); + if (!head) + return NULL; + + /* squeeze out duplicates while reversing order */ + llist_for_each_entry_safe(node, t, head, llnode) { + if (!is_duplicate_mce_record(node, t)) + llist_add(&node->llnode, &new_head); + } + + return new_head.first; +} + void mce_gen_pool_process(void) { struct llist_node *head; diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 547720efd923..cd74a3f00aea 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -35,6 +35,7 @@ void mce_gen_pool_process(void); bool mce_gen_pool_empty(void); int mce_gen_pool_add(struct mce *mce); int mce_gen_pool_init(void); +struct llist_node *mce_gen_pool_prepare_records(void); extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp); struct dentry *mce_get_debugfs_dir(void); @@ -81,3 +82,17 @@ static inline int apei_clear_mce(u64 record_id) #endif void mce_inject_log(struct mce *m); + +/* + * We consider records to be equivalent if bank+status+addr+misc all match. + * This is only used when the system is going down because of a fatal error + * to avoid cluttering the console log with essentially repeated information. + * In normal processing all errors seen are logged. + */ +static inline bool mce_cmp(struct mce *m1, struct mce *m2) +{ + return m1->bank != m2->bank || + m1->status != m2->status || + m1->addr != m2->addr || + m1->misc != m2->misc; +} diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 5119766d9889..631356c8cca4 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -204,6 +204,33 @@ static int error_context(struct mce *m) return IN_KERNEL; } +static int mce_severity_amd_smca(struct mce *m, int err_ctx) +{ + u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank); + u32 low, high; + + /* + * We need to look at the following bits: + * - "succor" bit (data poisoning support), and + * - TCC bit (Task Context Corrupt) + * in MCi_STATUS to determine error severity. + */ + if (!mce_flags.succor) + return MCE_PANIC_SEVERITY; + + if (rdmsr_safe(addr, &low, &high)) + return MCE_PANIC_SEVERITY; + + /* TCC (Task context corrupt). If set and if IN_KERNEL, panic. */ + if ((low & MCI_CONFIG_MCAX) && + (m->status & MCI_STATUS_TCC) && + (err_ctx == IN_KERNEL)) + return MCE_PANIC_SEVERITY; + + /* ...otherwise invoke hwpoison handler. */ + return MCE_AR_SEVERITY; +} + /* * See AMD Error Scope Hierarchy table in a newer BKDG. 
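mce_gen_pool_prepare_records() above does two jobs in one pass: it restores time order by reversing the newest-first pending list, and it drops entries that mce_cmp() considers equivalent (same bank, status, addr and misc). Below is a self-contained sketch of the same idea on an ordinary singly linked list; the record layout and the sample values are made up for illustration, and the kernel's llist primitives are not used.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Minimal stand-in for struct mce: only the fields mce_cmp() compares. */
struct rec {
	int bank;
	uint64_t status, addr, misc;
	struct rec *next;
};

static bool same_record(const struct rec *a, const struct rec *b)
{
	return a->bank == b->bank && a->status == b->status &&
	       a->addr == b->addr && a->misc == b->misc;
}

static bool is_duplicate(const struct rec *r, const struct rec *list)
{
	for (; list; list = list->next)
		if (same_record(r, list))
			return true;
	return false;
}

/*
 * Reverse the newest-first pending list into time order, dropping an entry
 * when an equivalent one is still waiting further down the list -- the same
 * idea as mce_gen_pool_prepare_records().
 */
static struct rec *prepare_records(struct rec *head)
{
	struct rec *out = NULL, *next;

	for (; head; head = next) {
		next = head->next;
		if (!is_duplicate(head, next)) {
			head->next = out;
			out = head;
		}
	}
	return out;
}

int main(void)
{
	/* Newest first: a -> b -> c, where b and c are duplicates. */
	struct rec c = { .bank = 4, .status = 0xb200000000000000ULL };
	struct rec b = { .bank = 4, .status = 0xb200000000000000ULL, .next = &c };
	struct rec a = { .bank = 1, .status = 0x8c00000000000000ULL, .next = &b };

	for (struct rec *r = prepare_records(&a); r; r = r->next)
		printf("bank %d status %#llx\n", r->bank,
		       (unsigned long long)r->status);
	return 0;
}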
For example * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features" @@ -225,6 +252,9 @@ static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_exc * to at least kill process to prolong system operation. */ if (mce_flags.overflow_recov) { + if (mce_flags.smca) + return mce_severity_amd_smca(m, ctx); + /* software can try to contain */ if (!(m->mcgstatus & MCG_STATUS_RIPV) && (ctx == IN_KERNEL)) return MCE_PANIC_SEVERITY; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index f0c921b03e42..92e5e37d97bf 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -161,7 +161,6 @@ void mce_log(struct mce *mce) if (!mce_gen_pool_add(mce)) irq_work_queue(&mce_irq_work); - mce->finished = 0; wmb(); for (;;) { entry = mce_log_get_idx_check(mcelog.next); @@ -194,7 +193,6 @@ void mce_log(struct mce *mce) mcelog.entry[entry].finished = 1; wmb(); - mce->finished = 1; set_bit(0, &mce_need_notify); } @@ -224,6 +222,53 @@ void mce_unregister_decode_chain(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); +static inline u32 ctl_reg(int bank) +{ + return MSR_IA32_MCx_CTL(bank); +} + +static inline u32 status_reg(int bank) +{ + return MSR_IA32_MCx_STATUS(bank); +} + +static inline u32 addr_reg(int bank) +{ + return MSR_IA32_MCx_ADDR(bank); +} + +static inline u32 misc_reg(int bank) +{ + return MSR_IA32_MCx_MISC(bank); +} + +static inline u32 smca_ctl_reg(int bank) +{ + return MSR_AMD64_SMCA_MCx_CTL(bank); +} + +static inline u32 smca_status_reg(int bank) +{ + return MSR_AMD64_SMCA_MCx_STATUS(bank); +} + +static inline u32 smca_addr_reg(int bank) +{ + return MSR_AMD64_SMCA_MCx_ADDR(bank); +} + +static inline u32 smca_misc_reg(int bank) +{ + return MSR_AMD64_SMCA_MCx_MISC(bank); +} + +struct mca_msr_regs msr_ops = { + .ctl = ctl_reg, + .status = status_reg, + .addr = addr_reg, + .misc = misc_reg +}; + static void print_mce(struct mce *m) { int ret = 0; @@ -290,7 +335,9 @@ static void wait_for_panic(void) static void mce_panic(const char *msg, struct mce *final, char *exp) { - int i, apei_err = 0; + int apei_err = 0; + struct llist_node *pending; + struct mce_evt_llist *l; if (!fake_panic) { /* @@ -307,11 +354,10 @@ static void mce_panic(const char *msg, struct mce *final, char *exp) if (atomic_inc_return(&mce_fake_panicked) > 1) return; } + pending = mce_gen_pool_prepare_records(); /* First print corrected ones that are still unlogged */ - for (i = 0; i < MCE_LOG_LEN; i++) { - struct mce *m = &mcelog.entry[i]; - if (!(m->status & MCI_STATUS_VAL)) - continue; + llist_for_each_entry(l, pending, llnode) { + struct mce *m = &l->mce; if (!(m->status & MCI_STATUS_UC)) { print_mce(m); if (!apei_err) @@ -319,13 +365,11 @@ static void mce_panic(const char *msg, struct mce *final, char *exp) } } /* Now print uncorrected but with the final one last */ - for (i = 0; i < MCE_LOG_LEN; i++) { - struct mce *m = &mcelog.entry[i]; - if (!(m->status & MCI_STATUS_VAL)) - continue; + llist_for_each_entry(l, pending, llnode) { + struct mce *m = &l->mce; if (!(m->status & MCI_STATUS_UC)) continue; - if (!final || memcmp(m, final, sizeof(struct mce))) { + if (!final || mce_cmp(m, final)) { print_mce(m); if (!apei_err) apei_err = apei_write_mce(m); @@ -356,11 +400,11 @@ static int msr_to_offset(u32 msr) if (msr == mca_cfg.rip_msr) return offsetof(struct mce, ip); - if (msr == MSR_IA32_MCx_STATUS(bank)) + if (msr == msr_ops.status(bank)) return offsetof(struct mce, status); - if (msr == MSR_IA32_MCx_ADDR(bank)) + if (msr == 
msr_ops.addr(bank)) return offsetof(struct mce, addr); - if (msr == MSR_IA32_MCx_MISC(bank)) + if (msr == msr_ops.misc(bank)) return offsetof(struct mce, misc); if (msr == MSR_IA32_MCG_STATUS) return offsetof(struct mce, mcgstatus); @@ -523,9 +567,9 @@ static struct notifier_block mce_srao_nb = { static void mce_read_aux(struct mce *m, int i) { if (m->status & MCI_STATUS_MISCV) - m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); + m->misc = mce_rdmsrl(msr_ops.misc(i)); if (m->status & MCI_STATUS_ADDRV) { - m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); + m->addr = mce_rdmsrl(msr_ops.addr(i)); /* * Mask the reported address by the reported granularity. @@ -607,7 +651,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) m.tsc = 0; barrier(); - m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); + m.status = mce_rdmsrl(msr_ops.status(i)); if (!(m.status & MCI_STATUS_VAL)) continue; @@ -654,7 +698,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) /* * Clear state for this bank. */ - mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); + mce_wrmsrl(msr_ops.status(i), 0); } /* @@ -679,7 +723,7 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, char *tmp; for (i = 0; i < mca_cfg.banks; i++) { - m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); + m->status = mce_rdmsrl(msr_ops.status(i)); if (m->status & MCI_STATUS_VAL) { __set_bit(i, validp); if (quirk_no_way_out) @@ -830,9 +874,9 @@ static int mce_start(int *no_way_out) atomic_add(*no_way_out, &global_nwo); /* - * global_nwo should be updated before mce_callin + * Rely on the implied barrier below, such that global_nwo + * is updated before mce_callin. */ - smp_wmb(); order = atomic_inc_return(&mce_callin); /* @@ -957,7 +1001,7 @@ static void mce_clear_state(unsigned long *toclear) for (i = 0; i < mca_cfg.banks; i++) { if (test_bit(i, toclear)) - mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); + mce_wrmsrl(msr_ops.status(i), 0); } } @@ -994,11 +1038,12 @@ void do_machine_check(struct pt_regs *regs, long error_code) int i; int worst = 0; int severity; + /* * Establish sequential order between the CPUs entering the machine * check handler. */ - int order; + int order = -1; /* * If no_way_out gets set, there is no safe way to recover from this * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway. @@ -1012,7 +1057,12 @@ void do_machine_check(struct pt_regs *regs, long error_code) DECLARE_BITMAP(toclear, MAX_NR_BANKS); DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); char *msg = "Unknown"; - int lmce = 0; + + /* + * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES + * on Intel. + */ + int lmce = 1; /* If this CPU is offline, just bail out. */ if (cpu_is_offline(smp_processor_id())) { @@ -1051,19 +1101,20 @@ void do_machine_check(struct pt_regs *regs, long error_code) kill_it = 1; /* - * Check if this MCE is signaled to only this logical processor + * Check if this MCE is signaled to only this logical processor, + * on Intel only. */ - if (m.mcgstatus & MCG_STATUS_LMCES) - lmce = 1; - else { - /* - * Go through all the banks in exclusion of the other CPUs. - * This way we don't report duplicated events on shared banks - * because the first one to see it will clear it. - * If this is a Local MCE, then no need to perform rendezvous. - */ + if (m.cpuvendor == X86_VENDOR_INTEL) + lmce = m.mcgstatus & MCG_STATUS_LMCES; + + /* + * Go through all banks in exclusion of the other CPUs. This way we + * don't report duplicated events on shared banks because the first one + * to see it will clear it. 
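The msr_ops table introduced above is a plain function-pointer vtable: the per-bank register accessors are chosen once during vendor init, so call sites such as mce_rdmsrl(msr_ops.status(i)) never test mce_flags.smca themselves. A minimal standalone sketch of the pattern follows; the 0x400-based numbering matches the classic MCA layout, while the SMCA addresses here are illustrative placeholders rather than the real MSR_AMD64_SMCA_MCx_*() values.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* One accessor per per-bank register, mirroring struct mca_msr_regs. */
struct mca_msr_regs {
	uint32_t (*ctl)(int bank);
	uint32_t (*status)(int bank);
};

/* Legacy MCA layout: MSR 0x400 + 4*bank is CTL, +1 is STATUS. */
static uint32_t legacy_ctl(int bank)	{ return 0x400 + 4 * bank; }
static uint32_t legacy_status(int bank)	{ return 0x401 + 4 * bank; }

/* SMCA-style layout (placeholder addresses, not the real macros). */
static uint32_t smca_ctl(int bank)	{ return 0xc0002000 + 0x10 * bank; }
static uint32_t smca_status(int bank)	{ return 0xc0002001 + 0x10 * bank; }

static struct mca_msr_regs msr_ops = { legacy_ctl, legacy_status };

static void vendor_init(bool smca)
{
	/* Done once at init; every later reader just calls msr_ops.*(). */
	if (smca)
		msr_ops = (struct mca_msr_regs){ smca_ctl, smca_status };
}

int main(void)
{
	vendor_init(true);
	printf("bank 2 status MSR: %#x\n", msr_ops.status(2));
	return 0;
}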
If this is a Local MCE, then no need to + * perform rendezvous. + */ + if (!lmce) order = mce_start(&no_way_out); - } for (i = 0; i < cfg->banks; i++) { __clear_bit(i, toclear); @@ -1076,7 +1127,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) m.addr = 0; m.bank = i; - m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); + m.status = mce_rdmsrl(msr_ops.status(i)); if ((m.status & MCI_STATUS_VAL) == 0) continue; @@ -1420,7 +1471,6 @@ static void __mcheck_cpu_init_generic(void) enum mcp_flags m_fl = 0; mce_banks_t all_banks; u64 cap; - int i; if (!mca_cfg.bootlog) m_fl = MCP_DONTLOG; @@ -1436,14 +1486,19 @@ static void __mcheck_cpu_init_generic(void) rdmsrl(MSR_IA32_MCG_CAP, cap); if (cap & MCG_CTL_P) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); +} + +static void __mcheck_cpu_init_clear_banks(void) +{ + int i; for (i = 0; i < mca_cfg.banks; i++) { struct mce_bank *b = &mce_banks[i]; if (!b->init) continue; - wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); - wrmsrl(MSR_IA32_MCx_STATUS(i), 0); + wrmsrl(msr_ops.ctl(i), b->ctl); + wrmsrl(msr_ops.status(i), 0); } } @@ -1495,7 +1550,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) */ clear_bit(10, (unsigned long *)&mce_banks[4].ctl); } - if (c->x86 <= 17 && cfg->bootlog < 0) { + if (c->x86 < 17 && cfg->bootlog < 0) { /* * Lots of broken BIOS around that don't clear them * by default and leave crap in there. Don't log: @@ -1628,11 +1683,19 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) break; case X86_VENDOR_AMD: { - u32 ebx = cpuid_ebx(0x80000007); + mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV); + mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR); + mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA); - mce_flags.overflow_recov = !!(ebx & BIT(0)); - mce_flags.succor = !!(ebx & BIT(1)); - mce_flags.smca = !!(ebx & BIT(3)); + /* + * Install proper ops for Scalable MCA enabled processors + */ + if (mce_flags.smca) { + msr_ops.ctl = smca_ctl_reg; + msr_ops.status = smca_status_reg; + msr_ops.addr = smca_addr_reg; + msr_ops.misc = smca_misc_reg; + } mce_amd_feature_init(c); break; @@ -1717,6 +1780,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(c); + __mcheck_cpu_init_clear_banks(); __mcheck_cpu_init_timer(); } @@ -2082,7 +2146,7 @@ static void mce_disable_error_reporting(void) struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(MSR_IA32_MCx_CTL(i), 0); + wrmsrl(msr_ops.ctl(i), 0); } return; } @@ -2121,6 +2185,7 @@ static void mce_syscore_resume(void) { __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info)); + __mcheck_cpu_init_clear_banks(); } static struct syscore_ops mce_syscore_ops = { @@ -2138,6 +2203,7 @@ static void mce_cpu_restart(void *data) if (!mce_available(raw_cpu_ptr(&cpu_info))) return; __mcheck_cpu_init_generic(); + __mcheck_cpu_init_clear_banks(); __mcheck_cpu_init_timer(); } @@ -2413,7 +2479,7 @@ static void mce_reenable_cpu(void *h) struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); + wrmsrl(msr_ops.ctl(i), b->ctl); } } diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 9d656fd436ef..10b0661651e0 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -54,14 +54,6 @@ /* Threshold LVT offset is at MSR0xC0000410[15:12] */ #define SMCA_THR_LVT_OFF 0xF000 -/* - * OS is required to set the MCAX bit to acknowledge that it is now using the - * new MSR ranges and new registers under 
each bank. It also means that the OS - * will configure deferred errors in the new MCx_CONFIG register. If the bit is - * not set, uncorrectable errors will cause a system panic. - */ -#define SMCA_MCAX_EN_OFF 0x1 - static const char * const th_names[] = { "load_store", "insn_fetch", @@ -333,7 +325,7 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high, /* Fall back to method we used for older processors: */ switch (block) { case 0: - addr = MSR_IA32_MCx_MISC(bank); + addr = msr_ops.misc(bank); break; case 1: offset = ((low & MASK_BLKPTR_LO) >> 21); @@ -351,6 +343,7 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, int offset, u32 misc_high) { unsigned int cpu = smp_processor_id(); + u32 smca_low, smca_high, smca_addr; struct threshold_block b; int new; @@ -369,24 +362,49 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, b.interrupt_enable = 1; - if (mce_flags.smca) { - u32 smca_low, smca_high; - u32 smca_addr = MSR_AMD64_SMCA_MCx_CONFIG(bank); + if (!mce_flags.smca) { + new = (misc_high & MASK_LVTOFF_HI) >> 20; + goto set_offset; + } - if (!rdmsr_safe(smca_addr, &smca_low, &smca_high)) { - smca_high |= SMCA_MCAX_EN_OFF; - wrmsr(smca_addr, smca_low, smca_high); - } + smca_addr = MSR_AMD64_SMCA_MCx_CONFIG(bank); - /* Gather LVT offset for thresholding: */ - if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high)) - goto out; + if (!rdmsr_safe(smca_addr, &smca_low, &smca_high)) { + /* + * OS is required to set the MCAX bit to acknowledge that it is + * now using the new MSR ranges and new registers under each + * bank. It also means that the OS will configure deferred + * errors in the new MCx_CONFIG register. If the bit is not set, + * uncorrectable errors will cause a system panic. + * + * MCA_CONFIG[MCAX] is bit 32 (0 in the high portion of the MSR.) + */ + smca_high |= BIT(0); - new = (smca_low & SMCA_THR_LVT_OFF) >> 12; - } else { - new = (misc_high & MASK_LVTOFF_HI) >> 20; + /* + * SMCA logs Deferred Error information in MCA_DE{STAT,ADDR} + * registers with the option of additionally logging to + * MCA_{STATUS,ADDR} if MCA_CONFIG[LogDeferredInMcaStat] is set. + * + * This bit is usually set by BIOS to retain the old behavior + * for OSes that don't use the new registers. Linux supports the + * new registers so let's disable that additional logging here. + * + * MCA_CONFIG[LogDeferredInMcaStat] is bit 34 (bit 2 in the high + * portion of the MSR). 
+ */ + smca_high &= ~BIT(2); + + wrmsr(smca_addr, smca_low, smca_high); } + /* Gather LVT offset for thresholding: */ + if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high)) + goto out; + + new = (smca_low & SMCA_THR_LVT_OFF) >> 12; + +set_offset: offset = setup_APIC_mce_threshold(offset, new); if ((offset == new) && (mce_threshold_vector != amd_threshold_interrupt)) @@ -430,12 +448,23 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) deferred_error_interrupt_enable(c); } -static void __log_error(unsigned int bank, bool threshold_err, u64 misc) +static void +__log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc) { + u32 msr_status = msr_ops.status(bank); + u32 msr_addr = msr_ops.addr(bank); struct mce m; u64 status; - rdmsrl(MSR_IA32_MCx_STATUS(bank), status); + WARN_ON_ONCE(deferred_err && threshold_err); + + if (deferred_err && mce_flags.smca) { + msr_status = MSR_AMD64_SMCA_MCx_DESTAT(bank); + msr_addr = MSR_AMD64_SMCA_MCx_DEADDR(bank); + } + + rdmsrl(msr_status, status); + if (!(status & MCI_STATUS_VAL)) return; @@ -448,10 +477,11 @@ static void __log_error(unsigned int bank, bool threshold_err, u64 misc) m.misc = misc; if (m.status & MCI_STATUS_ADDRV) - rdmsrl(MSR_IA32_MCx_ADDR(bank), m.addr); + rdmsrl(msr_addr, m.addr); mce_log(&m); - wrmsrl(MSR_IA32_MCx_STATUS(bank), 0); + + wrmsrl(msr_status, 0); } static inline void __smp_deferred_error_interrupt(void) @@ -479,17 +509,21 @@ asmlinkage __visible void smp_trace_deferred_error_interrupt(void) /* APIC interrupt handler for deferred errors */ static void amd_deferred_error_interrupt(void) { - u64 status; unsigned int bank; + u32 msr_status; + u64 status; for (bank = 0; bank < mca_cfg.banks; ++bank) { - rdmsrl(MSR_IA32_MCx_STATUS(bank), status); + msr_status = (mce_flags.smca) ? 
MSR_AMD64_SMCA_MCx_DESTAT(bank) + : msr_ops.status(bank); + + rdmsrl(msr_status, status); if (!(status & MCI_STATUS_VAL) || !(status & MCI_STATUS_DEFERRED)) continue; - __log_error(bank, false, 0); + __log_error(bank, true, false, 0); break; } } @@ -544,7 +578,7 @@ static void amd_threshold_interrupt(void) return; log: - __log_error(bank, true, ((u64)high << 32) | low); + __log_error(bank, false, true, ((u64)high << 32) | low); } /* diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 1e8bb6c94f14..1defb8ea882c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -84,7 +84,7 @@ static int cmci_supported(int *banks) */ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return 0; - if (!cpu_has_apic || lapic_get_maxlvt() < 6) + if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6) return 0; rdmsrl(MSR_IA32_MCG_CAP, cap); *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff); diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index ac780cad3b86..6b9dc4d18ccc 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -450,7 +450,7 @@ asmlinkage __visible void smp_trace_thermal_interrupt(struct pt_regs *regs) /* Thermal monitoring depends on APIC, ACPI and clock modulation */ static int intel_thermal_supported(struct cpuinfo_x86 *c) { - if (!cpu_has_apic) + if (!boot_cpu_has(X86_FEATURE_APIC)) return 0; if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) return 0; diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index f8c81ba0b465..b1086f79e57e 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -137,7 +137,7 @@ static void prepare_set(void) u32 cr0; /* Save value of CR4 and clear Page Global Enable (bit 7) */ - if (cpu_has_pge) { + if (boot_cpu_has(X86_FEATURE_PGE)) { cr4 = __read_cr4(); __write_cr4(cr4 & ~X86_CR4_PGE); } @@ -170,7 +170,7 @@ static void post_set(void) write_cr0(read_cr0() & ~X86_CR0_CD); /* Restore value of CR4 */ - if (cpu_has_pge) + if (boot_cpu_has(X86_FEATURE_PGE)) __write_cr4(cr4); } diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 19f57360dfd2..16e37a2581ac 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -444,11 +444,24 @@ static void __init print_mtrr_state(void) pr_debug("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20); } +/* PAT setup for BP. We need to go through sync steps here */ +void __init mtrr_bp_pat_init(void) +{ + unsigned long flags; + + local_irq_save(flags); + prepare_set(); + + pat_init(); + + post_set(); + local_irq_restore(flags); +} + /* Grab all of the MTRR state for this CPU into *state */ bool __init get_mtrr_state(void) { struct mtrr_var_range *vrs; - unsigned long flags; unsigned lo, dummy; unsigned int i; @@ -481,15 +494,6 @@ bool __init get_mtrr_state(void) mtrr_state_set = 1; - /* PAT setup for BP. 
We need to go through sync steps here */ - local_irq_save(flags); - prepare_set(); - - pat_init(); - - post_set(); - local_irq_restore(flags); - return !!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED); } @@ -741,7 +745,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) wbinvd(); /* Save value of CR4 and clear Page Global Enable (bit 7) */ - if (cpu_has_pge) { + if (boot_cpu_has(X86_FEATURE_PGE)) { cr4 = __read_cr4(); __write_cr4(cr4 & ~X86_CR4_PGE); } @@ -771,7 +775,7 @@ static void post_set(void) __releases(set_atomicity_lock) write_cr0(read_cr0() & ~X86_CR0_CD); /* Restore value of CR4 */ - if (cpu_has_pge) + if (boot_cpu_has(X86_FEATURE_PGE)) __write_cr4(cr4); raw_spin_unlock(&set_atomicity_lock); } diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 10f8d4796240..7d393ecdeee6 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -752,6 +752,9 @@ void __init mtrr_bp_init(void) /* BIOS may override */ __mtrr_enabled = get_mtrr_state(); + if (mtrr_enabled()) + mtrr_bp_pat_init(); + if (mtrr_cleanup(phys_addr)) { changed_by_mtrr_cleanup = 1; mtrr_if->set_all(); @@ -759,8 +762,16 @@ void __init mtrr_bp_init(void) } } - if (!mtrr_enabled()) + if (!mtrr_enabled()) { pr_info("MTRR: Disabled\n"); + + /* + * PAT initialization relies on MTRR's rendezvous handler. + * Skip PAT init until the handler can initialize both + * features independently. + */ + pat_disable("MTRRs disabled, skipping PAT initialization too."); + } } void mtrr_ap_init(void) diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 951884dcc433..6c7ced07d16d 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -52,6 +52,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); void fill_mtrr_var_range(unsigned int index, u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); bool get_mtrr_state(void); +void mtrr_bp_pat_init(void); extern void set_mtrr_ops(const struct mtrr_ops *ops); diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 364e58346897..8cac429b6a1d 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -94,7 +94,7 @@ static void __init vmware_platform_setup(void) */ static uint32_t __init vmware_platform(void) { - if (cpu_has_hypervisor) { + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { unsigned int eax; unsigned int hyper_vendor_id[3]; diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 1f4acd68b98b..3fe45f84ced4 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -151,7 +151,7 @@ static void __init dtb_lapic_setup(void) return; /* Did the boot loader setup the local APIC ? */ - if (!cpu_has_apic) { + if (!boot_cpu_has(X86_FEATURE_APIC)) { if (apic_force_enable(r.start)) return; } diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 8efa57a5f29e..2bb25c3fe2e8 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -260,19 +260,12 @@ int __die(const char *str, struct pt_regs *regs, long err) unsigned long sp; #endif printk(KERN_DEFAULT - "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); -#ifdef CONFIG_PREEMPT - printk("PREEMPT "); -#endif -#ifdef CONFIG_SMP - printk("SMP "); -#endif - if (debug_pagealloc_enabled()) - printk("DEBUG_PAGEALLOC "); -#ifdef CONFIG_KASAN - printk("KASAN"); -#endif - printk("\n"); + "%s: %04lx [#%d]%s%s%s%s\n", str, err & 0xffff, ++die_counter, + IS_ENABLED(CONFIG_PREEMPT) ? 
" PREEMPT" : "", + IS_ENABLED(CONFIG_SMP) ? " SMP" : "", + debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", + IS_ENABLED(CONFIG_KASAN) ? " KASAN" : ""); + if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP) return 1; diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/ebda.c index 992f442ca155..afe65dffee80 100644 --- a/arch/x86/kernel/head.c +++ b/arch/x86/kernel/ebda.c @@ -38,7 +38,7 @@ void __init reserve_ebda_region(void) * that the paravirt case can handle memory setup * correctly, without our help. */ - if (paravirt_enabled()) + if (!x86_platform.legacy.ebda_search) return; /* end of low (conventional) memory */ diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c index dd9ca9b60ff3..aad34aafc0e0 100644 --- a/arch/x86/kernel/fpu/bugs.c +++ b/arch/x86/kernel/fpu/bugs.c @@ -21,11 +21,15 @@ static double __initdata y = 3145727.0; * We should really only care about bugs here * anyway. Not features. */ -static void __init check_fpu(void) +void __init fpu__init_check_bugs(void) { u32 cr0_saved; s32 fdiv_bug; + /* kernel_fpu_begin/end() relies on patched alternative instructions. */ + if (!boot_cpu_has(X86_FEATURE_FPU)) + return; + /* We might have CR0::TS set already, clear it: */ cr0_saved = read_cr0(); write_cr0(cr0_saved & ~X86_CR0_TS); @@ -59,13 +63,3 @@ static void __init check_fpu(void) pr_warn("Hmm, FPU with FDIV bug\n"); } } - -void __init fpu__init_check_bugs(void) -{ - /* - * kernel_fpu_begin/end() in check_fpu() relies on the patched - * alternative instructions. - */ - if (cpu_has_fpu) - check_fpu(); -} diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 8e37cc8a539a..97027545a72d 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -217,14 +217,14 @@ static inline void fpstate_init_fstate(struct fregs_state *fp) void fpstate_init(union fpregs_state *state) { - if (!cpu_has_fpu) { + if (!static_cpu_has(X86_FEATURE_FPU)) { fpstate_init_soft(&state->soft); return; } memset(state, 0, xstate_size); - if (cpu_has_fxsr) + if (static_cpu_has(X86_FEATURE_FXSR)) fpstate_init_fxstate(&state->fxsave); else fpstate_init_fstate(&state->fsave); @@ -237,7 +237,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) dst_fpu->fpregs_active = 0; dst_fpu->last_cpu = -1; - if (!src_fpu->fpstate_active || !cpu_has_fpu) + if (!src_fpu->fpstate_active || !static_cpu_has(X86_FEATURE_FPU)) return 0; WARN_ON_FPU(src_fpu != ¤t->thread.fpu); @@ -506,33 +506,6 @@ void fpu__clear(struct fpu *fpu) * x87 math exception handling: */ -static inline unsigned short get_fpu_cwd(struct fpu *fpu) -{ - if (cpu_has_fxsr) { - return fpu->state.fxsave.cwd; - } else { - return (unsigned short)fpu->state.fsave.cwd; - } -} - -static inline unsigned short get_fpu_swd(struct fpu *fpu) -{ - if (cpu_has_fxsr) { - return fpu->state.fxsave.swd; - } else { - return (unsigned short)fpu->state.fsave.swd; - } -} - -static inline unsigned short get_fpu_mxcsr(struct fpu *fpu) -{ - if (cpu_has_xmm) { - return fpu->state.fxsave.mxcsr; - } else { - return MXCSR_DEFAULT; - } -} - int fpu__exception_code(struct fpu *fpu, int trap_nr) { int err; @@ -547,10 +520,15 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr) * so if this combination doesn't produce any single exception, * then we have a bad program that isn't synchronizing its FPU usage * and it will suffer the consequences since we won't be able to - * fully reproduce the context of the exception + * fully reproduce the context of the exception. 
*/ - cwd = get_fpu_cwd(fpu); - swd = get_fpu_swd(fpu); + if (boot_cpu_has(X86_FEATURE_FXSR)) { + cwd = fpu->state.fxsave.cwd; + swd = fpu->state.fxsave.swd; + } else { + cwd = (unsigned short)fpu->state.fsave.cwd; + swd = (unsigned short)fpu->state.fsave.swd; + } err = swd & ~cwd; } else { @@ -560,7 +538,11 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr) * unmasked exception was caught we must mask the exception mask bits * at 0x1f80, and then use these to mask the exception bits at 0x3f. */ - unsigned short mxcsr = get_fpu_mxcsr(fpu); + unsigned short mxcsr = MXCSR_DEFAULT; + + if (boot_cpu_has(X86_FEATURE_XMM)) + mxcsr = fpu->state.fxsave.mxcsr; + err = ~(mxcsr >> 7) & mxcsr; } diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 54c86fffbf9f..aacfd7a82cec 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -29,22 +29,22 @@ static void fpu__init_cpu_generic(void) unsigned long cr0; unsigned long cr4_mask = 0; - if (cpu_has_fxsr) + if (boot_cpu_has(X86_FEATURE_FXSR)) cr4_mask |= X86_CR4_OSFXSR; - if (cpu_has_xmm) + if (boot_cpu_has(X86_FEATURE_XMM)) cr4_mask |= X86_CR4_OSXMMEXCPT; if (cr4_mask) cr4_set_bits(cr4_mask); cr0 = read_cr0(); cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */ - if (!cpu_has_fpu) + if (!boot_cpu_has(X86_FEATURE_FPU)) cr0 |= X86_CR0_EM; write_cr0(cr0); /* Flush out any pending x87 state: */ #ifdef CONFIG_MATH_EMULATION - if (!cpu_has_fpu) + if (!boot_cpu_has(X86_FEATURE_FPU)) fpstate_init_soft(¤t->thread.fpu.state.soft); else #endif @@ -89,7 +89,7 @@ static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) } #ifndef CONFIG_MATH_EMULATION - if (!cpu_has_fpu) { + if (!boot_cpu_has(X86_FEATURE_FPU)) { pr_emerg("x86/fpu: Giving up, no FPU found and no math emulation present\n"); for (;;) asm volatile("hlt"); @@ -106,7 +106,7 @@ static void __init fpu__init_system_mxcsr(void) { unsigned int mask = 0; - if (cpu_has_fxsr) { + if (boot_cpu_has(X86_FEATURE_FXSR)) { /* Static because GCC does not get 16-byte stack alignment right: */ static struct fxregs_state fxregs __initdata; @@ -212,7 +212,7 @@ static void __init fpu__init_system_xstate_size_legacy(void) * fpu__init_system_xstate(). */ - if (!cpu_has_fpu) { + if (!boot_cpu_has(X86_FEATURE_FPU)) { /* * Disable xsave as we do not support it if i387 * emulation is enabled. @@ -221,7 +221,7 @@ static void __init fpu__init_system_xstate_size_legacy(void) setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); xstate_size = sizeof(struct swregs_state); } else { - if (cpu_has_fxsr) + if (boot_cpu_has(X86_FEATURE_FXSR)) xstate_size = sizeof(struct fxregs_state); else xstate_size = sizeof(struct fregs_state); diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 8bd1c003942a..81422dfb152b 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -21,7 +21,10 @@ int regset_xregset_fpregs_active(struct task_struct *target, const struct user_r { struct fpu *target_fpu = &target->thread.fpu; - return (cpu_has_fxsr && target_fpu->fpstate_active) ? 
regset->n : 0; + if (boot_cpu_has(X86_FEATURE_FXSR) && target_fpu->fpstate_active) + return regset->n; + else + return 0; } int xfpregs_get(struct task_struct *target, const struct user_regset *regset, @@ -30,7 +33,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, { struct fpu *fpu = &target->thread.fpu; - if (!cpu_has_fxsr) + if (!boot_cpu_has(X86_FEATURE_FXSR)) return -ENODEV; fpu__activate_fpstate_read(fpu); @@ -47,7 +50,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, struct fpu *fpu = &target->thread.fpu; int ret; - if (!cpu_has_fxsr) + if (!boot_cpu_has(X86_FEATURE_FXSR)) return -ENODEV; fpu__activate_fpstate_write(fpu); @@ -65,7 +68,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, * update the header bits in the xsave header, indicating the * presence of FP and SSE state. */ - if (cpu_has_xsave) + if (boot_cpu_has(X86_FEATURE_XSAVE)) fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; return ret; @@ -79,7 +82,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, struct xregs_state *xsave; int ret; - if (!cpu_has_xsave) + if (!boot_cpu_has(X86_FEATURE_XSAVE)) return -ENODEV; fpu__activate_fpstate_read(fpu); @@ -108,7 +111,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, struct xregs_state *xsave; int ret; - if (!cpu_has_xsave) + if (!boot_cpu_has(X86_FEATURE_XSAVE)) return -ENODEV; fpu__activate_fpstate_write(fpu); @@ -275,10 +278,10 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, fpu__activate_fpstate_read(fpu); - if (!static_cpu_has(X86_FEATURE_FPU)) + if (!boot_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); - if (!cpu_has_fxsr) + if (!boot_cpu_has(X86_FEATURE_FXSR)) return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &fpu->state.fsave, 0, -1); @@ -306,10 +309,10 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, fpu__activate_fpstate_write(fpu); fpstate_sanitize_xstate(fpu); - if (!static_cpu_has(X86_FEATURE_FPU)) + if (!boot_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); - if (!cpu_has_fxsr) + if (!boot_cpu_has(X86_FEATURE_FXSR)) return user_regset_copyin(&pos, &count, &kbuf, &ubuf, &fpu->state.fsave, 0, -1); @@ -325,7 +328,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, * update the header bit in the xsave header, indicating the * presence of FP. 
*/ - if (cpu_has_xsave) + if (boot_cpu_has(X86_FEATURE_XSAVE)) fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FP; return ret; } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index b48ef35b28d4..4ea2a59483c7 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -190,7 +190,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu) */ void fpu__init_cpu_xstate(void) { - if (!cpu_has_xsave || !xfeatures_mask) + if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask) return; cr4_set_bits(X86_CR4_OSXSAVE); @@ -280,7 +280,7 @@ static void __init setup_xstate_comp(void) xstate_comp_offsets[0] = 0; xstate_comp_offsets[1] = offsetof(struct fxregs_state, xmm_space); - if (!cpu_has_xsaves) { + if (!boot_cpu_has(X86_FEATURE_XSAVES)) { for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { if (xfeature_enabled(i)) { xstate_comp_offsets[i] = xstate_offsets[i]; @@ -316,13 +316,13 @@ static void __init setup_init_fpu_buf(void) WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; - if (!cpu_has_xsave) + if (!boot_cpu_has(X86_FEATURE_XSAVE)) return; setup_xstate_features(); print_xstate_features(); - if (cpu_has_xsaves) { + if (boot_cpu_has(X86_FEATURE_XSAVES)) { init_fpstate.xsave.header.xcomp_bv = (u64)1 << 63 | xfeatures_mask; init_fpstate.xsave.header.xfeatures = xfeatures_mask; } @@ -417,7 +417,7 @@ static int xfeature_size(int xfeature_nr) */ static int using_compacted_format(void) { - return cpu_has_xsaves; + return boot_cpu_has(X86_FEATURE_XSAVES); } static void __xstate_dump_leaves(void) @@ -549,7 +549,7 @@ static unsigned int __init calculate_xstate_size(void) unsigned int eax, ebx, ecx, edx; unsigned int calculated_xstate_size; - if (!cpu_has_xsaves) { + if (!boot_cpu_has(X86_FEATURE_XSAVES)) { /* * - CPUID function 0DH, sub-function 0: * EBX enumerates the size (in bytes) required by @@ -630,7 +630,7 @@ void __init fpu__init_system_xstate(void) WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; - if (!cpu_has_xsave) { + if (!boot_cpu_has(X86_FEATURE_XSAVE)) { pr_info("x86/fpu: Legacy x87 FPU detected.\n"); return; } @@ -667,7 +667,7 @@ void __init fpu__init_system_xstate(void) pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", xfeatures_mask, xstate_size, - cpu_has_xsaves ? "compacted" : "standard"); + boot_cpu_has(X86_FEATURE_XSAVES) ? 
"compacted" : "standard"); } /* @@ -678,7 +678,7 @@ void fpu__resume_cpu(void) /* * Restore XCR0 on xsave capable CPUs: */ - if (cpu_has_xsave) + if (boot_cpu_has(X86_FEATURE_XSAVE)) xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); } diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 2911ef3a9f1c..d784bb547a9d 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -34,6 +34,8 @@ asmlinkage __visible void __init i386_start_kernel(void) cr4_init_shadow(); sanitize_boot_params(&boot_params); + x86_early_init_platform_quirks(); + /* Call the subarch specific early setup function */ switch (boot_params.hdr.hardware_subarch) { case X86_SUBARCH_INTEL_MID: diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 1f4422d5c8d0..b72fb0b71dd1 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -182,6 +182,7 @@ void __init x86_64_start_reservations(char *real_mode_data) if (!boot_params.hdr.version) copy_bootdata(__va(real_mode_data)); + x86_early_init_platform_quirks(); reserve_ebda_region(); switch (boot_params.hdr.hardware_subarch) { diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index af1112980dd4..6f8902b0d151 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -555,62 +555,53 @@ early_idt_handler_common: */ cld - cmpl $2,(%esp) # X86_TRAP_NMI - je .Lis_nmi # Ignore NMI - - cmpl $2,%ss:early_recursion_flag - je hlt_loop incl %ss:early_recursion_flag - push %eax # 16(%esp) - push %ecx # 12(%esp) - push %edx # 8(%esp) - push %ds # 4(%esp) - push %es # 0(%esp) - movl $(__KERNEL_DS),%eax - movl %eax,%ds - movl %eax,%es - - cmpl $(__KERNEL_CS),32(%esp) - jne 10f + /* The vector number is in pt_regs->gs */ - leal 28(%esp),%eax # Pointer to %eip - call early_fixup_exception - andl %eax,%eax - jnz ex_entry /* found an exception entry */ - -10: -#ifdef CONFIG_PRINTK - xorl %eax,%eax - movw %ax,2(%esp) /* clean up the segment values on some cpus */ - movw %ax,6(%esp) - movw %ax,34(%esp) - leal 40(%esp),%eax - pushl %eax /* %esp before the exception */ - pushl %ebx - pushl %ebp - pushl %esi - pushl %edi - movl %cr2,%eax - pushl %eax - pushl (20+6*4)(%esp) /* trapno */ - pushl $fault_msg - call printk -#endif - call dump_stack -hlt_loop: - hlt - jmp hlt_loop - -ex_entry: - pop %es - pop %ds - pop %edx - pop %ecx - pop %eax - decl %ss:early_recursion_flag -.Lis_nmi: - addl $8,%esp /* drop vector number and error code */ + cld + pushl %fs /* pt_regs->fs */ + movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */ + pushl %es /* pt_regs->es */ + movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */ + pushl %ds /* pt_regs->ds */ + movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */ + pushl %eax /* pt_regs->ax */ + pushl %ebp /* pt_regs->bp */ + pushl %edi /* pt_regs->di */ + pushl %esi /* pt_regs->si */ + pushl %edx /* pt_regs->dx */ + pushl %ecx /* pt_regs->cx */ + pushl %ebx /* pt_regs->bx */ + + /* Fix up DS and ES */ + movl $(__KERNEL_DS), %ecx + movl %ecx, %ds + movl %ecx, %es + + /* Load the vector number into EDX */ + movl PT_GS(%esp), %edx + + /* Load GS into pt_regs->gs and clear high bits */ + movw %gs, PT_GS(%esp) + movw $0, PT_GS+2(%esp) + + movl %esp, %eax /* args are pt_regs (EAX), trapnr (EDX) */ + call early_fixup_exception + + popl %ebx /* pt_regs->bx */ + popl %ecx /* pt_regs->cx */ + popl %edx /* pt_regs->dx */ + popl %esi /* pt_regs->si */ + popl %edi /* pt_regs->di */ + popl %ebp /* pt_regs->bp */ + popl %eax /* pt_regs->ax */ + popl %ds /* 
pt_regs->ds */ + popl %es /* pt_regs->es */ + popl %fs /* pt_regs->fs */ + popl %gs /* pt_regs->gs */ + decl %ss:early_recursion_flag + addl $4, %esp /* pop pt_regs->orig_ax */ iret ENDPROC(early_idt_handler_common) @@ -647,10 +638,14 @@ ignore_int: popl %eax #endif iret + +hlt_loop: + hlt + jmp hlt_loop ENDPROC(ignore_int) __INITDATA .align 4 -early_recursion_flag: +GLOBAL(early_recursion_flag) .long 0 __REFDATA @@ -715,19 +710,6 @@ __INITRODATA int_msg: .asciz "Unknown interrupt or fault at: %p %p %p\n" -fault_msg: -/* fault info: */ - .ascii "BUG: Int %d: CR2 %p\n" -/* regs pushed in early_idt_handler: */ - .ascii " EDI %p ESI %p EBP %p EBX %p\n" - .ascii " ESP %p ES %p DS %p\n" - .ascii " EDX %p ECX %p EAX %p\n" -/* fault frame: */ - .ascii " vec %p err %p EIP %p CS %p flg %p\n" - .ascii "Stack: %p %p %p %p %p %p %p %p\n" - .ascii " %p %p %p %p %p %p %p %p\n" - .asciz " %p %p %p %p %p %p %p %p\n" - #include "../../x86/xen/xen-head.S" /* diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 22fbf9df61bb..5df831ef1442 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -20,6 +20,7 @@ #include <asm/processor-flags.h> #include <asm/percpu.h> #include <asm/nops.h> +#include "../entry/calling.h" #ifdef CONFIG_PARAVIRT #include <asm/asm-offsets.h> @@ -64,6 +65,14 @@ startup_64: * tables and then reload them. */ + /* + * Setup stack for verify_cpu(). "-8" because stack_start is defined + * this way, see below. Our best guess is a NULL ptr for stack + * termination heuristics and we don't want to break anything which + * might depend on it (kgdb, ...). + */ + leaq (__end_init_task - 8)(%rip), %rsp + /* Sanitize CPU configuration */ call verify_cpu @@ -350,90 +359,48 @@ early_idt_handler_common: */ cld - cmpl $2,(%rsp) # X86_TRAP_NMI - je .Lis_nmi # Ignore NMI - - cmpl $2,early_recursion_flag(%rip) - jz 1f incl early_recursion_flag(%rip) - pushq %rax # 64(%rsp) - pushq %rcx # 56(%rsp) - pushq %rdx # 48(%rsp) - pushq %rsi # 40(%rsp) - pushq %rdi # 32(%rsp) - pushq %r8 # 24(%rsp) - pushq %r9 # 16(%rsp) - pushq %r10 # 8(%rsp) - pushq %r11 # 0(%rsp) - - cmpl $__KERNEL_CS,96(%rsp) - jne 11f - - cmpl $14,72(%rsp) # Page fault? + /* The vector number is currently in the pt_regs->di slot. */ + pushq %rsi /* pt_regs->si */ + movq 8(%rsp), %rsi /* RSI = vector number */ + movq %rdi, 8(%rsp) /* pt_regs->di = RDI */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq %rax /* pt_regs->ax */ + pushq %r8 /* pt_regs->r8 */ + pushq %r9 /* pt_regs->r9 */ + pushq %r10 /* pt_regs->r10 */ + pushq %r11 /* pt_regs->r11 */ + pushq %rbx /* pt_regs->bx */ + pushq %rbp /* pt_regs->bp */ + pushq %r12 /* pt_regs->r12 */ + pushq %r13 /* pt_regs->r13 */ + pushq %r14 /* pt_regs->r14 */ + pushq %r15 /* pt_regs->r15 */ + + cmpq $14,%rsi /* Page fault? 
*/ jnz 10f - GET_CR2_INTO(%rdi) # can clobber any volatile register if pv + GET_CR2_INTO(%rdi) /* Can clobber any volatile register if pv */ call early_make_pgtable andl %eax,%eax - jz 20f # All good + jz 20f /* All good */ 10: - leaq 88(%rsp),%rdi # Pointer to %rip + movq %rsp,%rdi /* RDI = pt_regs; RSI is already trapnr */ call early_fixup_exception - andl %eax,%eax - jnz 20f # Found an exception entry - -11: -#ifdef CONFIG_EARLY_PRINTK - GET_CR2_INTO(%r9) # can clobber any volatile register if pv - movl 80(%rsp),%r8d # error code - movl 72(%rsp),%esi # vector number - movl 96(%rsp),%edx # %cs - movq 88(%rsp),%rcx # %rip - xorl %eax,%eax - leaq early_idt_msg(%rip),%rdi - call early_printk - cmpl $2,early_recursion_flag(%rip) - jz 1f - call dump_stack -#ifdef CONFIG_KALLSYMS - leaq early_idt_ripmsg(%rip),%rdi - movq 40(%rsp),%rsi # %rip again - call __print_symbol -#endif -#endif /* EARLY_PRINTK */ -1: hlt - jmp 1b - -20: # Exception table entry found or page table generated - popq %r11 - popq %r10 - popq %r9 - popq %r8 - popq %rdi - popq %rsi - popq %rdx - popq %rcx - popq %rax + +20: decl early_recursion_flag(%rip) -.Lis_nmi: - addq $16,%rsp # drop vector number and error code - INTERRUPT_RETURN + jmp restore_regs_and_iret ENDPROC(early_idt_handler_common) __INITDATA .balign 4 -early_recursion_flag: +GLOBAL(early_recursion_flag) .long 0 -#ifdef CONFIG_EARLY_PRINTK -early_idt_msg: - .asciz "PANIC: early exception %02lx rip %lx:%lx error %lx cr2 %lx\n" -early_idt_ripmsg: - .asciz "RIP %s\n" -#endif /* CONFIG_EARLY_PRINTK */ - #define NEXT_PAGE(name) \ .balign PAGE_SIZE; \ GLOBAL(name) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index a1f0e4a5c47e..f112af7aa62e 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -54,7 +54,7 @@ struct hpet_dev { char name[10]; }; -inline struct hpet_dev *EVT_TO_HPET_DEV(struct clock_event_device *evtdev) +static inline struct hpet_dev *EVT_TO_HPET_DEV(struct clock_event_device *evtdev) { return container_of(evtdev, struct hpet_dev, evt); } @@ -773,7 +773,6 @@ static struct clocksource clocksource_hpet = { .mask = HPET_MASK, .flags = CLOCK_SOURCE_IS_CONTINUOUS, .resume = hpet_resume_counter, - .archdata = { .vclock_mode = VCLOCK_HPET }, }; static int hpet_clocksource_register(void) diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index e565e0e4d216..fc25f698d792 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -13,6 +13,7 @@ #include <linux/cpu.h> #include <asm/kprobes.h> #include <asm/alternative.h> +#include <asm/text-patching.h> #ifdef HAVE_JUMP_LABEL diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 2da6ee9ae69b..04cde527d728 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -45,6 +45,7 @@ #include <linux/uaccess.h> #include <linux/memory.h> +#include <asm/text-patching.h> #include <asm/debugreg.h> #include <asm/apicdef.h> #include <asm/apic.h> diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index ae703acb85c1..38cf7a741250 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -51,6 +51,7 @@ #include <linux/ftrace.h> #include <linux/frame.h> +#include <asm/text-patching.h> #include <asm/cacheflush.h> #include <asm/desc.h> #include <asm/pgtable.h> diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 7b3b9d15c47a..4425f593f0ec 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -29,6 +29,7 @@ #include <linux/kallsyms.h> 
#include <linux/ftrace.h> +#include <asm/text-patching.h> #include <asm/cacheflush.h> #include <asm/desc.h> #include <asm/pgtable.h> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 807950860fb7..eea2a6f72b31 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -285,14 +285,6 @@ static void __init paravirt_ops_setup(void) { pv_info.name = "KVM"; - /* - * KVM isn't paravirt in the sense of paravirt_enabled. A KVM - * guest kernel works like a bare metal kernel with additional - * features, and paravirt_enabled is about features that are - * missing. - */ - pv_info.paravirt_enabled = 0; - if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) pv_cpu_ops.io_delay = kvm_io_delay; @@ -522,7 +514,7 @@ static noinline uint32_t __kvm_cpuid_base(void) if (boot_cpu_data.cpuid_level < 0) return 0; /* So we don't blow up on old processors */ - if (cpu_has_hypervisor) + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0); return 0; diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 005c03e93fc5..477ae806c2fa 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -31,6 +31,7 @@ #include <linux/jump_label.h> #include <linux/random.h> +#include <asm/text-patching.h> #include <asm/page.h> #include <asm/pgtable.h> #include <asm/setup.h> diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index f08ac28b8136..7b3b3f24c3ea 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -294,7 +294,6 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void) struct pv_info pv_info = { .name = "bare hardware", - .paravirt_enabled = 0, .kernel_rpl = 0, .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */ @@ -339,8 +338,10 @@ __visible struct pv_cpu_ops pv_cpu_ops = { .write_cr8 = native_write_cr8, #endif .wbinvd = native_wbinvd, - .read_msr = native_read_msr_safe, - .write_msr = native_write_msr_safe, + .read_msr = native_read_msr, + .write_msr = native_write_msr, + .read_msr_safe = native_read_msr_safe, + .write_msr_safe = native_write_msr_safe, .read_pmc = native_read_pmc, .load_tr_desc = native_load_tr_desc, .set_ldt = native_set_ldt, diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c index 35ccf75696eb..f712dfdf1357 100644 --- a/arch/x86/kernel/pci-iommu_table.c +++ b/arch/x86/kernel/pci-iommu_table.c @@ -72,7 +72,7 @@ void __init check_iommu_entries(struct iommu_table_entry *start, } } #else -inline void check_iommu_entries(struct iommu_table_entry *start, +void __init check_iommu_entries(struct iommu_table_entry *start, struct iommu_table_entry *finish) { } diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c new file mode 100644 index 000000000000..b2f8a33b36ff --- /dev/null +++ b/arch/x86/kernel/platform-quirks.c @@ -0,0 +1,35 @@ +#include <linux/kernel.h> +#include <linux/init.h> + +#include <asm/setup.h> +#include <asm/bios_ebda.h> + +void __init x86_early_init_platform_quirks(void) +{ + x86_platform.legacy.rtc = 1; + x86_platform.legacy.ebda_search = 0; + x86_platform.legacy.devices.pnpbios = 1; + + switch (boot_params.hdr.hardware_subarch) { + case X86_SUBARCH_PC: + x86_platform.legacy.ebda_search = 1; + break; + case X86_SUBARCH_XEN: + case X86_SUBARCH_LGUEST: + case X86_SUBARCH_INTEL_MID: + case X86_SUBARCH_CE4100: + x86_platform.legacy.devices.pnpbios = 0; + x86_platform.legacy.rtc = 0; + break; + } + + if (x86_platform.set_legacy_features) + x86_platform.set_legacy_features(); +} + +#if 
defined(CONFIG_PNPBIOS) +bool __init arch_pnpbios_disabled(void) +{ + return x86_platform.legacy.devices.pnpbios == 0; +} +#endif diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 6cbab31ac23a..6b16c36f0939 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -136,25 +136,6 @@ void release_thread(struct task_struct *dead_task) } } -static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) -{ - struct user_desc ud = { - .base_addr = addr, - .limit = 0xfffff, - .seg_32bit = 1, - .limit_in_pages = 1, - .useable = 1, - }; - struct desc_struct *desc = t->thread.tls_array; - desc += tls; - fill_ldt(desc, &ud); -} - -static inline u32 read_32bit_tls(struct task_struct *t, int tls) -{ - return get_desc_base(&t->thread.tls_array[tls]); -} - int copy_thread_tls(unsigned long clone_flags, unsigned long sp, unsigned long arg, struct task_struct *p, unsigned long tls) { @@ -169,9 +150,9 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, p->thread.io_bitmap_ptr = NULL; savesegment(gs, p->thread.gsindex); - p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs; + p->thread.gsbase = p->thread.gsindex ? 0 : me->thread.gsbase; savesegment(fs, p->thread.fsindex); - p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs; + p->thread.fsbase = p->thread.fsindex ? 0 : me->thread.fsbase; savesegment(es, p->thread.es); savesegment(ds, p->thread.ds); memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); @@ -210,7 +191,7 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, */ if (clone_flags & CLONE_SETTLS) { #ifdef CONFIG_IA32_EMULATION - if (is_ia32_task()) + if (in_ia32_syscall()) err = do_set_thread_area(p, -1, (struct user_desc __user *)tls, 0); else @@ -282,7 +263,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(cpu_tss, cpu); - unsigned fsindex, gsindex; + unsigned prev_fsindex, prev_gsindex; fpu_switch_t fpu_switch; fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu); @@ -292,8 +273,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * * (e.g. xen_load_tls()) */ - savesegment(fs, fsindex); - savesegment(gs, gsindex); + savesegment(fs, prev_fsindex); + savesegment(gs, prev_gsindex); /* * Load TLS before restoring any segments so that segment loads @@ -336,66 +317,104 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * Switch FS and GS. * * These are even more complicated than DS and ES: they have - * 64-bit bases are that controlled by arch_prctl. Those bases - * only differ from the values in the GDT or LDT if the selector - * is 0. - * - * Loading the segment register resets the hidden base part of - * the register to 0 or the value from the GDT / LDT. If the - * next base address zero, writing 0 to the segment register is - * much faster than using wrmsr to explicitly zero the base. - * - * The thread_struct.fs and thread_struct.gs values are 0 - * if the fs and gs bases respectively are not overridden - * from the values implied by fsindex and gsindex. They - * are nonzero, and store the nonzero base addresses, if - * the bases are overridden. - * - * (fs != 0 && fsindex != 0) || (gs != 0 && gsindex != 0) should - * be impossible. - * - * Therefore we need to reload the segment registers if either - * the old or new selector is nonzero, and we need to override - * the base address if next thread expects it to be overridden. 
+ * 64-bit bases are that controlled by arch_prctl. The bases + * don't necessarily match the selectors, as user code can do + * any number of things to cause them to be inconsistent. * - * This code is unnecessarily slow in the case where the old and - * new indexes are zero and the new base is nonzero -- it will - * unnecessarily write 0 to the selector before writing the new - * base address. + * We don't promise to preserve the bases if the selectors are + * nonzero. We also don't promise to preserve the base if the + * selector is zero and the base doesn't match whatever was + * most recently passed to ARCH_SET_FS/GS. (If/when the + * FSGSBASE instructions are enabled, we'll need to offer + * stronger guarantees.) * - * Note: This all depends on arch_prctl being the only way that - * user code can override the segment base. Once wrfsbase and - * wrgsbase are enabled, most of this code will need to change. + * As an invariant, + * (fsbase != 0 && fsindex != 0) || (gsbase != 0 && gsindex != 0) is + * impossible. */ - if (unlikely(fsindex | next->fsindex | prev->fs)) { + if (next->fsindex) { + /* Loading a nonzero value into FS sets the index and base. */ loadsegment(fs, next->fsindex); - - /* - * If user code wrote a nonzero value to FS, then it also - * cleared the overridden base address. - * - * XXX: if user code wrote 0 to FS and cleared the base - * address itself, we won't notice and we'll incorrectly - * restore the prior base address next time we reschdule - * the process. - */ - if (fsindex) - prev->fs = 0; + } else { + if (next->fsbase) { + /* Next index is zero but next base is nonzero. */ + if (prev_fsindex) + loadsegment(fs, 0); + wrmsrl(MSR_FS_BASE, next->fsbase); + } else { + /* Next base and index are both zero. */ + if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { + /* + * We don't know the previous base and can't + * find out without RDMSR. Forcibly clear it. + */ + loadsegment(fs, __USER_DS); + loadsegment(fs, 0); + } else { + /* + * If the previous index is zero and ARCH_SET_FS + * didn't change the base, then the base is + * also zero and we don't need to do anything. + */ + if (prev->fsbase || prev_fsindex) + loadsegment(fs, 0); + } + } } - if (next->fs) - wrmsrl(MSR_FS_BASE, next->fs); - prev->fsindex = fsindex; + /* + * Save the old state and preserve the invariant. + * NB: if prev_fsindex == 0, then we can't reliably learn the base + * without RDMSR because Intel user code can zero it without telling + * us and AMD user code can program any 32-bit value without telling + * us. + */ + if (prev_fsindex) + prev->fsbase = 0; + prev->fsindex = prev_fsindex; - if (unlikely(gsindex | next->gsindex | prev->gs)) { + if (next->gsindex) { + /* Loading a nonzero value into GS sets the index and base. */ load_gs_index(next->gsindex); - - /* This works (and fails) the same way as fsindex above. */ - if (gsindex) - prev->gs = 0; + } else { + if (next->gsbase) { + /* Next index is zero but next base is nonzero. */ + if (prev_gsindex) + load_gs_index(0); + wrmsrl(MSR_KERNEL_GS_BASE, next->gsbase); + } else { + /* Next base and index are both zero. */ + if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { + /* + * We don't know the previous base and can't + * find out without RDMSR. Forcibly clear it. + * + * This contains a pointless SWAPGS pair. + * Fixing it would involve an explicit check + * for Xen or a new pvop. 
+ */ + load_gs_index(__USER_DS); + load_gs_index(0); + } else { + /* + * If the previous index is zero and ARCH_SET_GS + * didn't change the base, then the base is + * also zero and we don't need to do anything. + */ + if (prev->gsbase || prev_gsindex) + load_gs_index(0); + } + } } - if (next->gs) - wrmsrl(MSR_KERNEL_GS_BASE, next->gs); - prev->gsindex = gsindex; + /* + * Save the old state and preserve the invariant. + * NB: if prev_gsindex == 0, then we can't reliably learn the base + * without RDMSR because Intel user code can zero it without telling + * us and AMD user code can program any 32-bit value without telling + * us. + */ + if (prev_gsindex) + prev->gsbase = 0; + prev->gsindex = prev_gsindex; switch_fpu_finish(next_fpu, fpu_switch); @@ -516,23 +535,11 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) if (addr >= TASK_SIZE_OF(task)) return -EPERM; cpu = get_cpu(); - /* handle small bases via the GDT because that's faster to - switch. */ - if (addr <= 0xffffffff) { - set_32bit_tls(task, GS_TLS, addr); - if (doit) { - load_TLS(&task->thread, cpu); - load_gs_index(GS_TLS_SEL); - } - task->thread.gsindex = GS_TLS_SEL; - task->thread.gs = 0; - } else { - task->thread.gsindex = 0; - task->thread.gs = addr; - if (doit) { - load_gs_index(0); - ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr); - } + task->thread.gsindex = 0; + task->thread.gsbase = addr; + if (doit) { + load_gs_index(0); + ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr); } put_cpu(); break; @@ -542,52 +549,30 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) if (addr >= TASK_SIZE_OF(task)) return -EPERM; cpu = get_cpu(); - /* handle small bases via the GDT because that's faster to - switch. */ - if (addr <= 0xffffffff) { - set_32bit_tls(task, FS_TLS, addr); - if (doit) { - load_TLS(&task->thread, cpu); - loadsegment(fs, FS_TLS_SEL); - } - task->thread.fsindex = FS_TLS_SEL; - task->thread.fs = 0; - } else { - task->thread.fsindex = 0; - task->thread.fs = addr; - if (doit) { - /* set the selector to 0 to not confuse - __switch_to */ - loadsegment(fs, 0); - ret = wrmsrl_safe(MSR_FS_BASE, addr); - } + task->thread.fsindex = 0; + task->thread.fsbase = addr; + if (doit) { + /* set the selector to 0 to not confuse __switch_to */ + loadsegment(fs, 0); + ret = wrmsrl_safe(MSR_FS_BASE, addr); } put_cpu(); break; case ARCH_GET_FS: { unsigned long base; - if (task->thread.fsindex == FS_TLS_SEL) - base = read_32bit_tls(task, FS_TLS); - else if (doit) + if (doit) rdmsrl(MSR_FS_BASE, base); else - base = task->thread.fs; + base = task->thread.fsbase; ret = put_user(base, (unsigned long __user *)addr); break; } case ARCH_GET_GS: { unsigned long base; - unsigned gsindex; - if (task->thread.gsindex == GS_TLS_SEL) - base = read_32bit_tls(task, GS_TLS); - else if (doit) { - savesegment(gs, gsindex); - if (gsindex) - rdmsrl(MSR_KERNEL_GS_BASE, base); - else - base = task->thread.gs; - } else - base = task->thread.gs; + if (doit) + rdmsrl(MSR_KERNEL_GS_BASE, base); + else + base = task->thread.gsbase; ret = put_user(base, (unsigned long __user *)addr); break; } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 32e9d9cbb884..e60ef918f53d 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -303,29 +303,11 @@ static int set_segment_reg(struct task_struct *task, switch (offset) { case offsetof(struct user_regs_struct,fs): - /* - * If this is setting fs as for normal 64-bit use but - * setting fs_base has implicitly changed it, leave it. 
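The do_arch_prctl() rework above makes ARCH_SET_FS/ARCH_SET_GS unconditionally clear the selector and program the base MSR, dropping the old small-base GDT fast path, while ARCH_GET_FS/ARCH_GET_GS now simply report the MSR or the saved thread.fsbase/gsbase. A minimal userspace sketch of that interface, assuming an x86_64 Linux toolchain; tls_area is only an illustrative buffer:

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <asm/prctl.h>			/* ARCH_SET_GS, ARCH_GET_GS */

static long tls_area[64];		/* stand-in per-thread data block */

int main(void)
{
	unsigned long base = 0;

	/* Afterwards the GS selector is 0 and GSBASE points at tls_area. */
	if (syscall(SYS_arch_prctl, ARCH_SET_GS, (unsigned long)tls_area))
		perror("ARCH_SET_GS");

	/* For the calling task this reads the base back from the MSR. */
	if (syscall(SYS_arch_prctl, ARCH_GET_GS, &base))
		perror("ARCH_GET_GS");

	printf("gsbase=%#lx &tls_area=%p\n", base, (void *)tls_area);
	return 0;
}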
- */ - if ((value == FS_TLS_SEL && task->thread.fsindex == 0 && - task->thread.fs != 0) || - (value == 0 && task->thread.fsindex == FS_TLS_SEL && - task->thread.fs == 0)) - break; task->thread.fsindex = value; if (task == current) loadsegment(fs, task->thread.fsindex); break; case offsetof(struct user_regs_struct,gs): - /* - * If this is setting gs as for normal 64-bit use but - * setting gs_base has implicitly changed it, leave it. - */ - if ((value == GS_TLS_SEL && task->thread.gsindex == 0 && - task->thread.gs != 0) || - (value == 0 && task->thread.gsindex == GS_TLS_SEL && - task->thread.gs == 0)) - break; task->thread.gsindex = value; if (task == current) load_gs_index(task->thread.gsindex); @@ -417,7 +399,7 @@ static int putreg(struct task_struct *child, * to set either thread.fs or thread.fsindex and the * corresponding GDT slot. */ - if (child->thread.fs != value) + if (child->thread.fsbase != value) return do_arch_prctl(child, ARCH_SET_FS, value); return 0; case offsetof(struct user_regs_struct,gs_base): @@ -426,7 +408,7 @@ static int putreg(struct task_struct *child, */ if (value >= TASK_SIZE_OF(child)) return -EIO; - if (child->thread.gs != value) + if (child->thread.gsbase != value) return do_arch_prctl(child, ARCH_SET_GS, value); return 0; #endif @@ -453,31 +435,17 @@ static unsigned long getreg(struct task_struct *task, unsigned long offset) #ifdef CONFIG_X86_64 case offsetof(struct user_regs_struct, fs_base): { /* - * do_arch_prctl may have used a GDT slot instead of - * the MSR. To userland, it appears the same either - * way, except the %fs segment selector might not be 0. + * XXX: This will not behave as expected if called on + * current or if fsindex != 0. */ - unsigned int seg = task->thread.fsindex; - if (task->thread.fs != 0) - return task->thread.fs; - if (task == current) - asm("movl %%fs,%0" : "=r" (seg)); - if (seg != FS_TLS_SEL) - return 0; - return get_desc_base(&task->thread.tls_array[FS_TLS]); + return task->thread.fsbase; } case offsetof(struct user_regs_struct, gs_base): { /* - * Exactly the same here as the %fs handling above. + * XXX: This will not behave as expected if called on + * current or if fsindex != 0. */ - unsigned int seg = task->thread.gsindex; - if (task->thread.gs != 0) - return task->thread.gs; - if (task == current) - asm("movl %%gs,%0" : "=r" (seg)); - if (seg != GS_TLS_SEL) - return 0; - return get_desc_base(&task->thread.tls_array[GS_TLS]); + return task->thread.gsbase; } #endif } @@ -1266,7 +1234,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, compat_ulong_t caddr, compat_ulong_t cdata) { #ifdef CONFIG_X86_X32_ABI - if (!is_ia32_task()) + if (!in_ia32_syscall()) return x32_arch_ptrace(child, request, caddr, cdata); #endif #ifdef CONFIG_IA32_EMULATION diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index ab0adc0fa5db..a9b31eb815f2 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -535,6 +535,15 @@ static void native_machine_emergency_restart(void) mode = reboot_mode == REBOOT_WARM ? 0x1234 : 0; *((unsigned short *)__va(0x472)) = mode; + /* + * If an EFI capsule has been registered with the firmware then + * override the reboot= parameter. 
+ */ + if (efi_capsule_pending(NULL)) { + pr_info("EFI capsule is pending, forcing EFI reboot.\n"); + reboot_type = BOOT_EFI; + } + for (;;) { /* Could also try the reset bit in the Hammer NB */ switch (reboot_type) { diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 4af8d063fb36..eceaa082ec3f 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -14,6 +14,7 @@ #include <asm/time.h> #include <asm/intel-mid.h> #include <asm/rtc.h> +#include <asm/setup.h> #ifdef CONFIG_X86_32 /* @@ -185,22 +186,7 @@ static __init int add_rtc_cmos(void) } } #endif - if (of_have_populated_dt()) - return 0; - - /* Intel MID platforms don't have ioport rtc */ - if (intel_mid_identify_cpu()) - return -ENODEV; - -#ifdef CONFIG_ACPI - if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) { - /* This warning can likely go away again in a year or two. */ - pr_info("ACPI: not registering RTC platform device\n"); - return -ENODEV; - } -#endif - - if (paravirt_enabled() && !paravirt_has(RTC)) + if (!x86_platform.legacy.rtc) return -ENODEV; platform_device_register(&rtc_device); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 548ddf7d6fd2..22cc2f9f8aec 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -248,18 +248,17 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, if (config_enabled(CONFIG_X86_64)) sp -= 128; - if (!onsigstack) { - /* This is the X/Open sanctioned signal stack switching. */ - if (ka->sa.sa_flags & SA_ONSTACK) { - if (current->sas_ss_size) - sp = current->sas_ss_sp + current->sas_ss_size; - } else if (config_enabled(CONFIG_X86_32) && - (regs->ss & 0xffff) != __USER_DS && - !(ka->sa.sa_flags & SA_RESTORER) && - ka->sa.sa_restorer) { - /* This is the legacy signal stack switching. */ - sp = (unsigned long) ka->sa.sa_restorer; - } + /* This is the X/Open sanctioned signal stack switching. */ + if (ka->sa.sa_flags & SA_ONSTACK) { + if (sas_ss_flags(sp) == 0) + sp = current->sas_ss_sp + current->sas_ss_size; + } else if (config_enabled(CONFIG_X86_32) && + !onsigstack && + (regs->ss & 0xffff) != __USER_DS && + !(ka->sa.sa_flags & SA_RESTORER) && + ka->sa.sa_restorer) { + /* This is the legacy signal stack switching. */ + sp = (unsigned long) ka->sa.sa_restorer; } if (fpu->fpstate_active) { @@ -391,7 +390,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, put_user_ex(&frame->uc, &frame->puc); /* Create the ucontext. */ - if (cpu_has_xsave) + if (boot_cpu_has(X86_FEATURE_XSAVE)) put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); else put_user_ex(0, &frame->uc.uc_flags); @@ -442,7 +441,7 @@ static unsigned long frame_uc_flags(struct pt_regs *regs) { unsigned long flags; - if (cpu_has_xsave) + if (boot_cpu_has(X86_FEATURE_XSAVE)) flags = UC_FP_XSTATE | UC_SIGCONTEXT_SS; else flags = UC_SIGCONTEXT_SS; @@ -762,7 +761,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) { #ifdef CONFIG_X86_64 - if (is_ia32_task()) + if (in_ia32_syscall()) return __NR_ia32_restart_syscall; #endif #ifdef CONFIG_X86_X32_ABI diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 0e4329ed91ef..fafe8b923cac 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1236,7 +1236,7 @@ static int __init smp_sanity_check(unsigned max_cpus) * If we couldn't find a local APIC, then get out of here now! 
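The get_sigframe() change above keys the switch to the alternate stack off sas_ss_flags(sp) rather than the "already on the signal stack" test. For context, the userspace facility involved is sigaltstack() plus SA_ONSTACK; a small sketch, assuming Linux/glibc (the 64 KiB stack size and the address printout are only illustrative):

#include <signal.h>
#include <stdio.h>
#include <string.h>

static char altstack[64 * 1024];	/* generously sized alternate stack */

static void handler(int sig)
{
	int probe;

	/* With SA_ONSTACK this frame (and 'probe') lives inside altstack. */
	printf("handler local %p, altstack [%p, %p)\n",
	       (void *)&probe, (void *)altstack,
	       (void *)(altstack + sizeof(altstack)));
}

int main(void)
{
	stack_t ss = {
		.ss_sp		= altstack,
		.ss_size	= sizeof(altstack),
		.ss_flags	= 0,
	};
	struct sigaction sa;

	if (sigaltstack(&ss, NULL))
		perror("sigaltstack");

	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = handler;
	sa.sa_flags = SA_ONSTACK;	/* deliver this signal on the alternate stack */
	sigaction(SIGUSR1, &sa, NULL);

	raise(SIGUSR1);			/* synchronous, so printf() here is harmless */
	return 0;
}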
*/ if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && - !cpu_has_apic) { + !boot_cpu_has(X86_FEATURE_APIC)) { if (!disable_apic) { pr_err("BIOS bug, local APIC #%d not detected!...\n", boot_cpu_physical_apicid); diff --git a/arch/x86/kernel/sysfb_efi.c b/arch/x86/kernel/sysfb_efi.c index 5da924bbf0a0..623965e86b65 100644 --- a/arch/x86/kernel/sysfb_efi.c +++ b/arch/x86/kernel/sysfb_efi.c @@ -68,6 +68,21 @@ struct efifb_dmi_info efifb_dmi_list[] = { [M_UNKNOWN] = { NULL, 0, 0, 0, 0, OVERRIDE_NONE } }; +void efifb_setup_from_dmi(struct screen_info *si, const char *opt) +{ + int i; + + for (i = 0; i < M_UNKNOWN; i++) { + if (efifb_dmi_list[i].base != 0 && + !strcmp(opt, efifb_dmi_list[i].optname)) { + si->lfb_base = efifb_dmi_list[i].base; + si->lfb_linelength = efifb_dmi_list[i].stride; + si->lfb_width = efifb_dmi_list[i].width; + si->lfb_height = efifb_dmi_list[i].height; + } + } +} + #define choose_value(dmivalue, fwvalue, field, flags) ({ \ typeof(fwvalue) _ret_ = fwvalue; \ if ((flags) & (field)) \ diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index e72a07f20b05..9b0185fbe3eb 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -74,12 +74,6 @@ void __init tboot_probe(void) return; } - /* only a natively booted kernel should be using TXT */ - if (paravirt_enabled()) { - pr_warning("non-0 tboot_addr but pv_ops is enabled\n"); - return; - } - /* Map and check for tboot UUID. */ set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr); tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE); diff --git a/arch/x86/kernel/tce_64.c b/arch/x86/kernel/tce_64.c index ab40954e113e..f386bad0984e 100644 --- a/arch/x86/kernel/tce_64.c +++ b/arch/x86/kernel/tce_64.c @@ -40,7 +40,7 @@ static inline void flush_tce(void* tceaddr) { /* a single tce can't cross a cache line */ - if (cpu_has_clflush) + if (boot_cpu_has(X86_FEATURE_CLFLUSH)) clflush(tceaddr); else wbinvd(); diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 7fc5e843f247..9692a5e9fdab 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -114,6 +114,7 @@ int do_set_thread_area(struct task_struct *p, int idx, int can_allocate) { struct user_desc info; + unsigned short __maybe_unused sel, modified_sel; if (copy_from_user(&info, u_info, sizeof(info))) return -EFAULT; @@ -141,6 +142,47 @@ int do_set_thread_area(struct task_struct *p, int idx, set_tls_desc(p, idx, &info, 1); + /* + * If DS, ES, FS, or GS points to the modified segment, forcibly + * refresh it. Only needed on x86_64 because x86_32 reloads them + * on return to user mode. 
+ */ + modified_sel = (idx << 3) | 3; + + if (p == current) { +#ifdef CONFIG_X86_64 + savesegment(ds, sel); + if (sel == modified_sel) + loadsegment(ds, sel); + + savesegment(es, sel); + if (sel == modified_sel) + loadsegment(es, sel); + + savesegment(fs, sel); + if (sel == modified_sel) + loadsegment(fs, sel); + + savesegment(gs, sel); + if (sel == modified_sel) + load_gs_index(sel); +#endif + +#ifdef CONFIG_X86_32_LAZY_GS + savesegment(gs, sel); + if (sel == modified_sel) + loadsegment(gs, sel); +#endif + } else { +#ifdef CONFIG_X86_64 + if (p->thread.fsindex == modified_sel) + p->thread.fsbase = info.base_addr; + + if (p->thread.gsindex == modified_sel) + p->thread.gsbase = info.base_addr; +#endif + } + return 0; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 06cbe25861f1..d1590486204a 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -51,6 +51,7 @@ #include <asm/processor.h> #include <asm/debugreg.h> #include <linux/atomic.h> +#include <asm/text-patching.h> #include <asm/ftrace.h> #include <asm/traps.h> #include <asm/desc.h> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index c9c4c7ce3eb2..38ba6de56ede 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -36,7 +36,7 @@ static int __read_mostly tsc_unstable; /* native_sched_clock() is called before tsc_init(), so we must start with the TSC soft disabled to prevent - erroneous rdtsc usage on !cpu_has_tsc processors */ + erroneous rdtsc usage on !boot_cpu_has(X86_FEATURE_TSC) processors */ static int __read_mostly tsc_disabled = -1; static DEFINE_STATIC_KEY_FALSE(__use_tsc); @@ -834,15 +834,15 @@ int recalibrate_cpu_khz(void) #ifndef CONFIG_SMP unsigned long cpu_khz_old = cpu_khz; - if (cpu_has_tsc) { - tsc_khz = x86_platform.calibrate_tsc(); - cpu_khz = tsc_khz; - cpu_data(0).loops_per_jiffy = - cpufreq_scale(cpu_data(0).loops_per_jiffy, - cpu_khz_old, cpu_khz); - return 0; - } else + if (!boot_cpu_has(X86_FEATURE_TSC)) return -ENODEV; + + tsc_khz = x86_platform.calibrate_tsc(); + cpu_khz = tsc_khz; + cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy, + cpu_khz_old, cpu_khz); + + return 0; #else return -ENODEV; #endif @@ -922,9 +922,6 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, struct cpufreq_freqs *freq = data; unsigned long *lpj; - if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC)) - return 0; - lpj = &boot_cpu_data.loops_per_jiffy; #ifdef CONFIG_SMP if (!(freq->flags & CPUFREQ_CONST_LOOPS)) @@ -954,9 +951,9 @@ static struct notifier_block time_cpufreq_notifier_block = { .notifier_call = time_cpufreq_notifier }; -static int __init cpufreq_tsc(void) +static int __init cpufreq_register_tsc_scaling(void) { - if (!cpu_has_tsc) + if (!boot_cpu_has(X86_FEATURE_TSC)) return 0; if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) return 0; @@ -965,7 +962,7 @@ static int __init cpufreq_tsc(void) return 0; } -core_initcall(cpufreq_tsc); +core_initcall(cpufreq_register_tsc_scaling); #endif /* CONFIG_CPU_FREQ */ @@ -1081,7 +1078,7 @@ static void __init check_system_tsc_reliable(void) */ int unsynchronized_tsc(void) { - if (!cpu_has_tsc || tsc_unstable) + if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_unstable) return 1; #ifdef CONFIG_SMP @@ -1205,7 +1202,7 @@ out: static int __init init_tsc_clocksource(void) { - if (!cpu_has_tsc || tsc_disabled > 0 || !tsc_khz) + if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_disabled > 0 || !tsc_khz) return 0; if (tsc_clocksource_reliable) @@ -1242,7 +1239,7 @@ void __init tsc_init(void) u64 
lpj; int cpu; - if (!cpu_has_tsc) { + if (!boot_cpu_has(X86_FEATURE_TSC)) { setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); return; } diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index bf4db6eaec8f..6c1ff31d99ff 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -516,7 +516,7 @@ struct uprobe_xol_ops { static inline int sizeof_long(void) { - return is_ia32_task() ? 4 : 8; + return in_ia32_syscall() ? 4 : 8; } static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) @@ -578,7 +578,7 @@ static void default_abort_op(struct arch_uprobe *auprobe, struct pt_regs *regs) riprel_post_xol(auprobe, regs); } -static struct uprobe_xol_ops default_xol_ops = { +static const struct uprobe_xol_ops default_xol_ops = { .pre_xol = default_pre_xol_op, .post_xol = default_post_xol_op, .abort = default_abort_op, @@ -695,7 +695,7 @@ static void branch_clear_offset(struct arch_uprobe *auprobe, struct insn *insn) 0, insn->immediate.nbytes); } -static struct uprobe_xol_ops branch_xol_ops = { +static const struct uprobe_xol_ops branch_xol_ops = { .emulate = branch_emulate_op, .post_xol = branch_post_xol_op, }; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 4c941f88d405..9297a002d8e5 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -334,7 +334,7 @@ SECTIONS __brk_limit = .; } - . = ALIGN(PAGE_SIZE); + . = ALIGN(PAGE_SIZE); /* keep VO_INIT_SIZE page aligned */ _end = .; STABS_DEBUG diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index bbbaa802d13e..769af907f824 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -75,7 +75,7 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) return 0; /* Update OSXSAVE bit */ - if (cpu_has_xsave && best->function == 0x1) { + if (boot_cpu_has(X86_FEATURE_XSAVE) && best->function == 0x1) { best->ecx &= ~F(OSXSAVE); if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) best->ecx |= F(OSXSAVE); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b6f50e8b0a39..38c0c32926c9 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3844,7 +3844,8 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check, boot_cpu_data.x86_phys_bits, context->shadow_root_level, false, - cpu_has_gbpages, true, true); + boot_cpu_has(X86_FEATURE_GBPAGES), + true, true); else __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, boot_cpu_data.x86_phys_bits, diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 31346a3f20a5..fafd720ce10a 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1254,7 +1254,7 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) kvm_load_ldt(svm->host.ldt); #ifdef CONFIG_X86_64 loadsegment(fs, svm->host.fs); - wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); + wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase); load_gs_index(svm->host.gs); #else #ifdef CONFIG_X86_32_LAZY_GS diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 2f1ea2f61e1f..b72743c5668d 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -809,8 +809,7 @@ TRACE_EVENT(kvm_write_tsc_offset, #define host_clocks \ {VCLOCK_NONE, "none"}, \ - {VCLOCK_TSC, "tsc"}, \ - {VCLOCK_HPET, "hpet"} \ + {VCLOCK_TSC, "tsc"} \ TRACE_EVENT(kvm_update_master_clock, TP_PROTO(bool use_master_clock, unsigned int host_clock, bool offset_matched), diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 133679d520af..cb47fe3da292 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3390,7 
+3390,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) } } - if (cpu_has_xsaves) + if (boot_cpu_has(X86_FEATURE_XSAVES)) rdmsrl(MSR_IA32_XSS, host_xss); return 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9b7798c7b210..12f33e662382 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2611,7 +2611,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_MAX_MCE_BANKS; break; case KVM_CAP_XCRS: - r = cpu_has_xsave; + r = boot_cpu_has(X86_FEATURE_XSAVE); break; case KVM_CAP_TSC_CONTROL: r = kvm_has_tsc_control; @@ -3094,7 +3094,7 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) /* Set XSTATE_BV and possibly XCOMP_BV. */ xsave->header.xfeatures = xstate_bv; - if (cpu_has_xsaves) + if (boot_cpu_has(X86_FEATURE_XSAVES)) xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; /* @@ -3121,7 +3121,7 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { - if (cpu_has_xsave) { + if (boot_cpu_has(X86_FEATURE_XSAVE)) { memset(guest_xsave, 0, sizeof(struct kvm_xsave)); fill_xsave((u8 *) guest_xsave->region, vcpu); } else { @@ -3139,7 +3139,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, u64 xstate_bv = *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; - if (cpu_has_xsave) { + if (boot_cpu_has(X86_FEATURE_XSAVE)) { /* * Here we allow setting states that are not present in * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility @@ -3160,7 +3160,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, struct kvm_xcrs *guest_xcrs) { - if (!cpu_has_xsave) { + if (!boot_cpu_has(X86_FEATURE_XSAVE)) { guest_xcrs->nr_xcrs = 0; return; } @@ -3176,7 +3176,7 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, { int i, r = 0; - if (!cpu_has_xsave) + if (!boot_cpu_has(X86_FEATURE_XSAVE)) return -EINVAL; if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags) @@ -5865,7 +5865,7 @@ int kvm_arch_init(void *opaque) perf_register_guest_info_callbacks(&kvm_guest_cbs); - if (cpu_has_xsave) + if (boot_cpu_has(X86_FEATURE_XSAVE)) host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); kvm_lapic_init(); @@ -7293,7 +7293,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) static void fx_init(struct kvm_vcpu *vcpu) { fpstate_init(&vcpu->arch.guest_fpu.state); - if (cpu_has_xsaves) + if (boot_cpu_has(X86_FEATURE_XSAVES)) vcpu->arch.guest_fpu.state.xsave.header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index fd57d3ae7e16..3847e736702e 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1408,13 +1408,10 @@ __init void lguest_init(void) { /* We're under lguest. */ pv_info.name = "lguest"; - /* Paravirt is enabled. */ - pv_info.paravirt_enabled = 1; /* We're running at privilege level 1, not 0 as normal. */ pv_info.kernel_rpl = 1; /* Everyone except Xen runs with this set. */ pv_info.shared_kernel_pmd = 1; - pv_info.features = 0; /* * We set up all the lguest overrides for sensitive operations. These diff --git a/arch/x86/lib/rwsem.S b/arch/x86/lib/rwsem.S index be110efa0096..bf2c6074efd2 100644 --- a/arch/x86/lib/rwsem.S +++ b/arch/x86/lib/rwsem.S @@ -29,8 +29,10 @@ * there is contention on the semaphore. * * %eax contains the semaphore pointer on entry. 
Save the C-clobbered - * registers (%eax, %edx and %ecx) except %eax whish is either a return - * value or just clobbered.. + * registers (%eax, %edx and %ecx) except %eax which is either a return + * value or just gets clobbered. Same is true for %edx so make sure GCC + * reloads it after the slow path, by making it hold a temporary, for + * example see ____down_write(). */ #define save_common_regs \ @@ -106,6 +108,16 @@ ENTRY(call_rwsem_down_write_failed) ret ENDPROC(call_rwsem_down_write_failed) +ENTRY(call_rwsem_down_write_failed_killable) + FRAME_BEGIN + save_common_regs + movq %rax,%rdi + call rwsem_down_write_failed_killable + restore_common_regs + FRAME_END + ret +ENDPROC(call_rwsem_down_write_failed_killable) + ENTRY(call_rwsem_wake) FRAME_BEGIN /* do nothing if still outstanding active readers */ diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 91d93b95bd86..b559d9238781 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -612,7 +612,7 @@ unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from, { stac(); #ifdef CONFIG_X86_INTEL_USERCOPY - if (n > 64 && cpu_has_xmm2) + if (n > 64 && static_cpu_has(X86_FEATURE_XMM2)) n = __copy_user_zeroing_intel_nocache(to, from, n); else __copy_user_zeroing(to, from, n); @@ -629,7 +629,7 @@ unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *fr { stac(); #ifdef CONFIG_X86_INTEL_USERCOPY - if (n > 64 && cpu_has_xmm2) + if (n > 64 && static_cpu_has(X86_FEATURE_XMM2)) n = __copy_user_intel_nocache(to, from, n); else __copy_user(to, from, n); diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index f98913258c63..62c0043a5fd5 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -2,7 +2,7 @@ KCOV_INSTRUMENT_tlb.o := n obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ - pat.o pgtable.o physaddr.o gup.o setup_nx.o + pat.o pgtable.o physaddr.o gup.o setup_nx.o tlb.o # Make sure __phys_addr has no stackprotector nostackp := $(call cc-option, -fno-stack-protector) @@ -12,7 +12,6 @@ CFLAGS_setup_nx.o := $(nostackp) CFLAGS_fault.o := -I$(src)/../include/asm/trace obj-$(CONFIG_X86_PAT) += pat_rbtree.o -obj-$(CONFIG_SMP) += tlb.o obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 82447b3fba38..4bb53b89f3c5 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -1,5 +1,6 @@ #include <linux/module.h> #include <asm/uaccess.h> +#include <asm/traps.h> typedef bool (*ex_handler_t)(const struct exception_table_entry *, struct pt_regs *, int); @@ -42,6 +43,43 @@ bool ex_handler_ext(const struct exception_table_entry *fixup, } EXPORT_SYMBOL(ex_handler_ext); +bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + WARN_ONCE(1, "unchecked MSR access error: RDMSR from 0x%x\n", + (unsigned int)regs->cx); + + /* Pretend that the read succeeded and returned 0. */ + regs->ip = ex_fixup_addr(fixup); + regs->ax = 0; + regs->dx = 0; + return true; +} +EXPORT_SYMBOL(ex_handler_rdmsr_unsafe); + +bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + WARN_ONCE(1, "unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x)\n", + (unsigned int)regs->cx, + (unsigned int)regs->dx, (unsigned int)regs->ax); + + /* Pretend that the write succeeded. 
*/ + regs->ip = ex_fixup_addr(fixup); + return true; +} +EXPORT_SYMBOL(ex_handler_wrmsr_unsafe); + +bool ex_handler_clear_fs(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + if (static_cpu_has(X86_BUG_NULL_SEG)) + asm volatile ("mov %0, %%fs" : : "rm" (__USER_DS)); + asm volatile ("mov %0, %%fs" : : "rm" (0)); + return ex_handler_default(fixup, regs, trapnr); +} +EXPORT_SYMBOL(ex_handler_clear_fs); + bool ex_has_fault_handler(unsigned long ip) { const struct exception_table_entry *e; @@ -82,24 +120,46 @@ int fixup_exception(struct pt_regs *regs, int trapnr) return handler(e, regs, trapnr); } +extern unsigned int early_recursion_flag; + /* Restricted version used during very early boot */ -int __init early_fixup_exception(unsigned long *ip) +void __init early_fixup_exception(struct pt_regs *regs, int trapnr) { - const struct exception_table_entry *e; - unsigned long new_ip; - ex_handler_t handler; - - e = search_exception_tables(*ip); - if (!e) - return 0; - - new_ip = ex_fixup_addr(e); - handler = ex_fixup_handler(e); - - /* special handling not supported during early boot */ - if (handler != ex_handler_default) - return 0; - - *ip = new_ip; - return 1; + /* Ignore early NMIs. */ + if (trapnr == X86_TRAP_NMI) + return; + + if (early_recursion_flag > 2) + goto halt_loop; + + if (regs->cs != __KERNEL_CS) + goto fail; + + /* + * The full exception fixup machinery is available as soon as + * the early IDT is loaded. This means that it is the + * responsibility of extable users to either function correctly + * when handlers are invoked early or to simply avoid causing + * exceptions before they're ready to handle them. + * + * This is better than filtering which handlers can be used, + * because refusing to call a handler here is guaranteed to + * result in a hard-to-debug panic. + * + * Keep in mind that not all vectors actually get here. Early + * fage faults, for example, are special. + */ + if (fixup_exception(regs, trapnr)) + return; + +fail: + early_printk("PANIC: early exception 0x%02x IP %lx:%lx error %lx cr2 0x%lx\n", + (unsigned)trapnr, (unsigned long)regs->cs, regs->ip, + regs->orig_ax, read_cr2()); + + show_regs(regs); + +halt_loop: + while (true) + halt(); } diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 740d7ac03a55..14a95054d4e0 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -162,7 +162,7 @@ static __init int setup_hugepagesz(char *opt) unsigned long ps = memparse(opt, &opt); if (ps == PMD_SIZE) { hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); - } else if (ps == PUD_SIZE && cpu_has_gbpages) { + } else if (ps == PUD_SIZE && boot_cpu_has(X86_FEATURE_GBPAGES)) { hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); } else { printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n", @@ -177,7 +177,7 @@ __setup("hugepagesz=", setup_hugepagesz); static __init int gigantic_pages_init(void) { /* With compaction or CMA we can allocate gigantic pages at runtime */ - if (cpu_has_gbpages && !size_to_hstate(1UL << PUD_SHIFT)) + if (boot_cpu_has(X86_FEATURE_GBPAGES) && !size_to_hstate(1UL << PUD_SHIFT)) hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); return 0; } diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c new file mode 100644 index 000000000000..ec21796ac5fd --- /dev/null +++ b/arch/x86/mm/ident_map.c @@ -0,0 +1,79 @@ +/* + * Helper routines for building identity mapping page tables. This is + * included by both the compressed kernel and the regular kernel. 
+ */ + +static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page, + unsigned long addr, unsigned long end) +{ + addr &= PMD_MASK; + for (; addr < end; addr += PMD_SIZE) { + pmd_t *pmd = pmd_page + pmd_index(addr); + + if (!pmd_present(*pmd)) + set_pmd(pmd, __pmd(addr | pmd_flag)); + } +} + +static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, + unsigned long addr, unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next) { + pud_t *pud = pud_page + pud_index(addr); + pmd_t *pmd; + + next = (addr & PUD_MASK) + PUD_SIZE; + if (next > end) + next = end; + + if (pud_present(*pud)) { + pmd = pmd_offset(pud, 0); + ident_pmd_init(info->pmd_flag, pmd, addr, next); + continue; + } + pmd = (pmd_t *)info->alloc_pgt_page(info->context); + if (!pmd) + return -ENOMEM; + ident_pmd_init(info->pmd_flag, pmd, addr, next); + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + } + + return 0; +} + +int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, + unsigned long addr, unsigned long end) +{ + unsigned long next; + int result; + int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0; + + for (; addr < end; addr = next) { + pgd_t *pgd = pgd_page + pgd_index(addr) + off; + pud_t *pud; + + next = (addr & PGDIR_MASK) + PGDIR_SIZE; + if (next > end) + next = end; + + if (pgd_present(*pgd)) { + pud = pud_offset(pgd, 0); + result = ident_pud_init(info, pud, addr, next); + if (result) + return result; + continue; + } + + pud = (pud_t *)info->alloc_pgt_page(info->context); + if (!pud) + return -ENOMEM; + result = ident_pud_init(info, pud, addr, next); + if (result) + return result; + set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + } + + return 0; +} diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 9d56f271d519..372aad2b3291 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -157,23 +157,23 @@ static void __init probe_page_size_mask(void) * This will simplify cpa(), which otherwise needs to support splitting * large pages into small in interrupt context, etc. 
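kernel_ident_mapping_init(), now shared via the new ident_map.c above, is driven through a struct x86_mapping_info: the caller supplies an alloc_pgt_page() callback for page-table pages, the PMD flags used for the 2 MB entries, and whether the mapping should be biased by __PAGE_OFFSET. A condensed caller sketch, loosely modeled on the kexec identity-map setup; simple_alloc_pgt_page() and build_ident_map() are hypothetical helpers, not part of this series:

#include <linux/gfp.h>
#include <asm/init.h>			/* struct x86_mapping_info, kernel_ident_mapping_init() */
#include <asm/pgtable_types.h>		/* __PAGE_KERNEL_LARGE_EXEC */

static void *simple_alloc_pgt_page(void *context)
{
	/* One zeroed page per new page-table level. */
	return (void *)get_zeroed_page(GFP_KERNEL);
}

static int build_ident_map(pgd_t *pgd, unsigned long start, unsigned long end)
{
	struct x86_mapping_info info = {
		.alloc_pgt_page	= simple_alloc_pgt_page,
		.context	= NULL,
		.pmd_flag	= __PAGE_KERNEL_LARGE_EXEC,
		.kernel_mapping	= false,	/* plain 1:1 mapping, no __PAGE_OFFSET bias */
	};

	return kernel_ident_mapping_init(&info, pgd, start, end);
}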
*/ - if (cpu_has_pse && !debug_pagealloc_enabled()) + if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled()) page_size_mask |= 1 << PG_LEVEL_2M; #endif /* Enable PSE if available */ - if (cpu_has_pse) + if (boot_cpu_has(X86_FEATURE_PSE)) cr4_set_bits_and_update_boot(X86_CR4_PSE); /* Enable PGE if available */ - if (cpu_has_pge) { + if (boot_cpu_has(X86_FEATURE_PGE)) { cr4_set_bits_and_update_boot(X86_CR4_PGE); __supported_pte_mask |= _PAGE_GLOBAL; } else __supported_pte_mask &= ~_PAGE_GLOBAL; /* Enable 1 GB linear kernel mappings if available: */ - if (direct_gbpages && cpu_has_gbpages) { + if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) { printk(KERN_INFO "Using GB pages for direct mapping\n"); page_size_mask |= 1 << PG_LEVEL_1G; } else { diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index bd7a9b9e2e14..84df150ee77e 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -284,7 +284,7 @@ kernel_physical_mapping_init(unsigned long start, */ mapping_iter = 1; - if (!cpu_has_pse) + if (!boot_cpu_has(X86_FEATURE_PSE)) use_pse = 0; repeat: @@ -804,9 +804,6 @@ void __init mem_init(void) BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END); #undef high_memory #undef __FIXADDR_TOP -#ifdef CONFIG_RANDOMIZE_BASE - BUILD_BUG_ON(CONFIG_RANDOMIZE_BASE_MAX_OFFSET > KERNEL_IMAGE_SIZE); -#endif #ifdef CONFIG_HIGHMEM BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 214afda97911..bce2e5d9edd4 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -58,79 +58,7 @@ #include "mm_internal.h" -static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page, - unsigned long addr, unsigned long end) -{ - addr &= PMD_MASK; - for (; addr < end; addr += PMD_SIZE) { - pmd_t *pmd = pmd_page + pmd_index(addr); - - if (!pmd_present(*pmd)) - set_pmd(pmd, __pmd(addr | pmd_flag)); - } -} -static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, - unsigned long addr, unsigned long end) -{ - unsigned long next; - - for (; addr < end; addr = next) { - pud_t *pud = pud_page + pud_index(addr); - pmd_t *pmd; - - next = (addr & PUD_MASK) + PUD_SIZE; - if (next > end) - next = end; - - if (pud_present(*pud)) { - pmd = pmd_offset(pud, 0); - ident_pmd_init(info->pmd_flag, pmd, addr, next); - continue; - } - pmd = (pmd_t *)info->alloc_pgt_page(info->context); - if (!pmd) - return -ENOMEM; - ident_pmd_init(info->pmd_flag, pmd, addr, next); - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); - } - - return 0; -} - -int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, - unsigned long addr, unsigned long end) -{ - unsigned long next; - int result; - int off = info->kernel_mapping ? 
pgd_index(__PAGE_OFFSET) : 0; - - for (; addr < end; addr = next) { - pgd_t *pgd = pgd_page + pgd_index(addr) + off; - pud_t *pud; - - next = (addr & PGDIR_MASK) + PGDIR_SIZE; - if (next > end) - next = end; - - if (pgd_present(*pgd)) { - pud = pud_offset(pgd, 0); - result = ident_pud_init(info, pud, addr, next); - if (result) - return result; - continue; - } - - pud = (pud_t *)info->alloc_pgt_page(info->context); - if (!pud) - return -ENOMEM; - result = ident_pud_init(info, pud, addr, next); - if (result) - return result; - set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); - } - - return 0; -} +#include "ident_map.c" /* * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the @@ -1295,7 +1223,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) struct vmem_altmap *altmap = to_vmem_altmap(start); int err; - if (cpu_has_pse) + if (boot_cpu_has(X86_FEATURE_PSE)) err = vmemmap_populate_hugepages(start, end, node, altmap); else if (altmap) { pr_err_once("%s: no cpu support for altmap allocations\n", @@ -1338,7 +1266,7 @@ void register_page_bootmem_memmap(unsigned long section_nr, } get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO); - if (!cpu_has_pse) { + if (!boot_cpu_has(X86_FEATURE_PSE)) { next = (addr + PAGE_SIZE) & PAGE_MASK; pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 0d8d53d1f5cc..f0894910bdd7 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -378,7 +378,7 @@ EXPORT_SYMBOL(iounmap); int __init arch_ioremap_pud_supported(void) { #ifdef CONFIG_X86_64 - return cpu_has_gbpages; + return boot_cpu_has(X86_FEATURE_GBPAGES); #else return 0; #endif @@ -386,7 +386,7 @@ int __init arch_ioremap_pud_supported(void) int __init arch_ioremap_pmd_supported(void) { - return cpu_has_pse; + return boot_cpu_has(X86_FEATURE_PSE); } /* diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 01be9ec3bf79..7a1f7bbf4105 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1055,7 +1055,7 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, /* * Map everything starting from the Gb boundary, possibly with 1G pages */ - while (cpu_has_gbpages && end - start >= PUD_SIZE) { + while (boot_cpu_has(X86_FEATURE_GBPAGES) && end - start >= PUD_SIZE) { set_pud(pud, __pud(cpa->pfn << PAGE_SHIFT | _PAGE_PSE | massage_pgprot(pud_pgprot))); @@ -1125,8 +1125,14 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, int primary) { - if (cpa->pgd) + if (cpa->pgd) { + /* + * Right now, we only execute this code path when mapping + * the EFI virtual memory map regions, no other users + * provide a ->pgd value. This may change in the future. + */ return populate_pgd(cpa, vaddr); + } /* * Ignore all non primary paths. 
@@ -1460,7 +1466,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, * error case we fall back to cpa_flush_all (which uses * WBINVD): */ - if (!ret && cpu_has_clflush) { + if (!ret && boot_cpu_has(X86_FEATURE_CLFLUSH)) { if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { cpa_flush_array(addr, numpages, cache, cpa.flags, pages); diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index faec01e7a17d..fb0604f11eec 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -40,11 +40,22 @@ static bool boot_cpu_done; static int __read_mostly __pat_enabled = IS_ENABLED(CONFIG_X86_PAT); +static void init_cache_modes(void); -static inline void pat_disable(const char *reason) +void pat_disable(const char *reason) { + if (!__pat_enabled) + return; + + if (boot_cpu_done) { + WARN_ONCE(1, "x86/PAT: PAT cannot be disabled after initialization\n"); + return; + } + __pat_enabled = 0; pr_info("x86/PAT: %s\n", reason); + + init_cache_modes(); } static int __init nopat(char *str) @@ -181,7 +192,7 @@ static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg) * configuration. * Using lower indices is preferred, so we start with highest index. */ -void pat_init_cache_modes(u64 pat) +static void __init_cache_modes(u64 pat) { enum page_cache_mode cache; char pat_msg[33]; @@ -202,14 +213,11 @@ static void pat_bsp_init(u64 pat) { u64 tmp_pat; - if (!cpu_has_pat) { + if (!boot_cpu_has(X86_FEATURE_PAT)) { pat_disable("PAT not supported by CPU."); return; } - if (!pat_enabled()) - goto done; - rdmsrl(MSR_IA32_CR_PAT, tmp_pat); if (!tmp_pat) { pat_disable("PAT MSR is 0, disabled."); @@ -218,16 +226,12 @@ static void pat_bsp_init(u64 pat) wrmsrl(MSR_IA32_CR_PAT, pat); -done: - pat_init_cache_modes(pat); + __init_cache_modes(pat); } static void pat_ap_init(u64 pat) { - if (!pat_enabled()) - return; - - if (!cpu_has_pat) { + if (!boot_cpu_has(X86_FEATURE_PAT)) { /* * If this happens we are on a secondary CPU, but switched to * PAT on the boot CPU. We have no way to undo PAT. @@ -238,18 +242,32 @@ static void pat_ap_init(u64 pat) wrmsrl(MSR_IA32_CR_PAT, pat); } -void pat_init(void) +static void init_cache_modes(void) { - u64 pat; - struct cpuinfo_x86 *c = &boot_cpu_data; + u64 pat = 0; + static int init_cm_done; - if (!pat_enabled()) { + if (init_cm_done) + return; + + if (boot_cpu_has(X86_FEATURE_PAT)) { + /* + * CPU supports PAT. Set PAT table to be consistent with + * PAT MSR. This case supports "nopat" boot option, and + * virtual machine environments which support PAT without + * MTRRs. In specific, Xen has unique setup to PAT MSR. + * + * If PAT MSR returns 0, it is considered invalid and emulates + * as No PAT. + */ + rdmsrl(MSR_IA32_CR_PAT, pat); + } + + if (!pat) { /* * No PAT. Emulate the PAT table that corresponds to the two - * cache bits, PWT (Write Through) and PCD (Cache Disable). This - * setup is the same as the BIOS default setup when the system - * has PAT but the "nopat" boot option has been specified. This - * emulated PAT table is used when MSR_IA32_CR_PAT returns 0. + * cache bits, PWT (Write Through) and PCD (Cache Disable). + * This setup is also the same as the BIOS default setup. 
* * PTE encoding: * @@ -266,10 +284,36 @@ void pat_init(void) */ pat = PAT(0, WB) | PAT(1, WT) | PAT(2, UC_MINUS) | PAT(3, UC) | PAT(4, WB) | PAT(5, WT) | PAT(6, UC_MINUS) | PAT(7, UC); + } + + __init_cache_modes(pat); + + init_cm_done = 1; +} + +/** + * pat_init - Initialize PAT MSR and PAT table + * + * This function initializes PAT MSR and PAT table with an OS-defined value + * to enable additional cache attributes, WC and WT. + * + * This function must be called on all CPUs using the specific sequence of + * operations defined in Intel SDM. mtrr_rendezvous_handler() provides this + * procedure for PAT. + */ +void pat_init(void) +{ + u64 pat; + struct cpuinfo_x86 *c = &boot_cpu_data; + + if (!pat_enabled()) { + init_cache_modes(); + return; + } - } else if ((c->x86_vendor == X86_VENDOR_INTEL) && - (((c->x86 == 0x6) && (c->x86_model <= 0xd)) || - ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) { + if ((c->x86_vendor == X86_VENDOR_INTEL) && + (((c->x86 == 0x6) && (c->x86_model <= 0xd)) || + ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) { /* * PAT support with the lower four entries. Intel Pentium 2, * 3, M, and 4 are affected by PAT errata, which makes the @@ -734,25 +778,6 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, if (file->f_flags & O_DSYNC) pcm = _PAGE_CACHE_MODE_UC_MINUS; -#ifdef CONFIG_X86_32 - /* - * On the PPro and successors, the MTRRs are used to set - * memory types for physical addresses outside main memory, - * so blindly setting UC or PWT on those pages is wrong. - * For Pentiums and earlier, the surround logic should disable - * caching for the high addresses through the KEN pin, but - * we maintain the tradition of paranoia in this code. - */ - if (!pat_enabled() && - !(boot_cpu_has(X86_FEATURE_MTRR) || - boot_cpu_has(X86_FEATURE_K6_MTRR) || - boot_cpu_has(X86_FEATURE_CYRIX_ARR) || - boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) && - (pfn << PAGE_SHIFT) >= __pa(high_memory)) { - pcm = _PAGE_CACHE_MODE_UC; - } -#endif - *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) | cachemode2protval(pcm)); return 1; diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index fe9b9f776361..5643fd0b1a7d 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -28,6 +28,8 @@ * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi */ +#ifdef CONFIG_SMP + struct flush_tlb_info { struct mm_struct *flush_mm; unsigned long flush_start; @@ -57,6 +59,118 @@ void leave_mm(int cpu) } EXPORT_SYMBOL_GPL(leave_mm); +#endif /* CONFIG_SMP */ + +void switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + unsigned long flags; + + local_irq_save(flags); + switch_mm_irqs_off(prev, next, tsk); + local_irq_restore(flags); +} + +void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + unsigned cpu = smp_processor_id(); + + if (likely(prev != next)) { +#ifdef CONFIG_SMP + this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); + this_cpu_write(cpu_tlbstate.active_mm, next); +#endif + cpumask_set_cpu(cpu, mm_cpumask(next)); + + /* + * Re-load page tables. + * + * This logic has an ordering constraint: + * + * CPU 0: Write to a PTE for 'next' + * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI. + * CPU 1: set bit 1 in next's mm_cpumask + * CPU 1: load from the PTE that CPU 0 writes (implicit) + * + * We need to prevent an outcome in which CPU 1 observes + * the new PTE value and CPU 0 observes bit 1 clear in + * mm_cpumask. 
(If that occurs, then the IPI will never + * be sent, and CPU 0's TLB will contain a stale entry.) + * + * The bad outcome can occur if either CPU's load is + * reordered before that CPU's store, so both CPUs must + * execute full barriers to prevent this from happening. + * + * Thus, switch_mm needs a full barrier between the + * store to mm_cpumask and any operation that could load + * from next->pgd. TLB fills are special and can happen + * due to instruction fetches or for no reason at all, + * and neither LOCK nor MFENCE orders them. + * Fortunately, load_cr3() is serializing and gives the + * ordering guarantee we need. + * + */ + load_cr3(next->pgd); + + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + + /* Stop flush ipis for the previous mm */ + cpumask_clear_cpu(cpu, mm_cpumask(prev)); + + /* Load per-mm CR4 state */ + load_mm_cr4(next); + +#ifdef CONFIG_MODIFY_LDT_SYSCALL + /* + * Load the LDT, if the LDT is different. + * + * It's possible that prev->context.ldt doesn't match + * the LDT register. This can happen if leave_mm(prev) + * was called and then modify_ldt changed + * prev->context.ldt but suppressed an IPI to this CPU. + * In this case, prev->context.ldt != NULL, because we + * never set context.ldt to NULL while the mm still + * exists. That means that next->context.ldt != + * prev->context.ldt, because mms never share an LDT. + */ + if (unlikely(prev->context.ldt != next->context.ldt)) + load_mm_ldt(next); +#endif + } +#ifdef CONFIG_SMP + else { + this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); + BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next); + + if (!cpumask_test_cpu(cpu, mm_cpumask(next))) { + /* + * On established mms, the mm_cpumask is only changed + * from irq context, from ptep_clear_flush() while in + * lazy tlb mode, and here. Irqs are blocked during + * schedule, protecting us from simultaneous changes. + */ + cpumask_set_cpu(cpu, mm_cpumask(next)); + + /* + * We were in lazy tlb mode and leave_mm disabled + * tlb flush IPI delivery. We must reload CR3 + * to make sure to use no freed page tables. + * + * As above, load_cr3() is serializing and orders TLB + * fills with respect to the mm_cpumask write. 
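The ordering constraint spelled out in the switch_mm_irqs_off() comment above is the classic store-buffering pattern: each side performs a store followed by a load, and unless something acts as a full barrier (here the serializing load_cr3()) both loads can observe the old values, i.e. no IPI is sent and the stale TLB entry survives. A stand-alone C11 sketch of that hazard, not kernel code; the names only mirror the comment:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

/* pte models the PTE write, mask_bit models the mm_cpumask bit. */
static atomic_int pte, mask_bit;
static int r0, r1;

static void *cpu0(void *arg)
{
	atomic_store_explicit(&pte, 1, memory_order_relaxed);		/* write PTE   */
	/* Without a full barrier here, this load can be satisfied early... */
	r0 = atomic_load_explicit(&mask_bit, memory_order_relaxed);	/* IPI needed? */
	return NULL;
}

static void *cpu1(void *arg)
{
	atomic_store_explicit(&mask_bit, 1, memory_order_relaxed);	/* set mask bit */
	/* ...and so can this one: both r0 and r1 may end up 0. */
	r1 = atomic_load_explicit(&pte, memory_order_relaxed);		/* TLB fill     */
	return NULL;
}

int main(void)
{
	pthread_t t0, t1;

	pthread_create(&t0, NULL, cpu0, NULL);
	pthread_create(&t1, NULL, cpu1, NULL);
	pthread_join(t0, NULL);
	pthread_join(t1, NULL);

	/*
	 * r0 == 0 && r1 == 0 is the "no IPI, stale TLB" outcome; a full
	 * barrier between each store and load (seq_cst here, smp_mb() or
	 * the serializing load_cr3() in the kernel) forbids it.
	 */
	printf("r0=%d r1=%d\n", r0, r1);
	return 0;
}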
+ */ + load_cr3(next->pgd); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + load_mm_cr4(next); + load_mm_ldt(next); + } + } +#endif +} + +#ifdef CONFIG_SMP + /* * The flush IPI assumes that a thread switch happens in this order: * [cpu0: the cpu that switches] @@ -353,3 +467,5 @@ static int __init create_tlb_single_page_flush_ceiling(void) return 0; } late_initcall(create_tlb_single_page_flush_ceiling); + +#endif /* CONFIG_SMP */ diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 0e07e0968c3a..28c04123b6dd 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -636,7 +636,7 @@ static int __init ppro_init(char **cpu_type) __u8 cpu_model = boot_cpu_data.x86_model; struct op_x86_model_spec *spec = &op_ppro_spec; /* default */ - if (force_cpu_type == arch_perfmon && cpu_has_arch_perfmon) + if (force_cpu_type == arch_perfmon && boot_cpu_has(X86_FEATURE_ARCH_PERFMON)) return 0; /* @@ -700,7 +700,7 @@ int __init op_nmi_init(struct oprofile_operations *ops) char *cpu_type = NULL; int ret = 0; - if (!cpu_has_apic) + if (!boot_cpu_has(X86_FEATURE_APIC)) return -ENODEV; if (force_cpu_type == timer) @@ -761,7 +761,7 @@ int __init op_nmi_init(struct oprofile_operations *ops) if (cpu_type) break; - if (!cpu_has_arch_perfmon) + if (!boot_cpu_has(X86_FEATURE_ARCH_PERFMON)) return -ENODEV; /* use arch perfmon as fallback */ diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index d90528ea5412..350f7096baac 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -75,7 +75,7 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model, u64 val; int i; - if (cpu_has_arch_perfmon) { + if (boot_cpu_has(X86_FEATURE_ARCH_PERFMON)) { union cpuid10_eax eax; eax.full = cpuid_eax(0xa); diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index beac4dfdade6..4bd08b0fc8ea 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -445,7 +445,7 @@ void __init xen_msi_init(void) uint32_t eax = cpuid_eax(xen_cpuid_base() + 4); if (((eax & XEN_HVM_CPUID_X2APIC_VIRT) && x2apic_mode) || - ((eax & XEN_HVM_CPUID_APIC_ACCESS_VIRT) && cpu_has_apic)) + ((eax & XEN_HVM_CPUID_APIC_ACCESS_VIRT) && boot_cpu_has(X86_FEATURE_APIC))) return; } diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 994a7df84a7b..f93545e7dc54 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -54,10 +54,6 @@ #include <asm/rtc.h> #include <asm/uv/uv.h> -#define EFI_DEBUG - -struct efi_memory_map memmap; - static struct efi efi_phys __initdata; static efi_system_table_t efi_systab __initdata; @@ -119,11 +115,10 @@ void efi_get_time(struct timespec *now) void __init efi_find_mirror(void) { - void *p; + efi_memory_desc_t *md; u64 mirror_size = 0, total_size = 0; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - efi_memory_desc_t *md = p; + for_each_efi_memory_desc(md) { unsigned long long start = md->phys_addr; unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; @@ -146,10 +141,9 @@ void __init efi_find_mirror(void) static void __init do_add_efi_memmap(void) { - void *p; + efi_memory_desc_t *md; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - efi_memory_desc_t *md = p; + for_each_efi_memory_desc(md) { unsigned long long start = md->phys_addr; unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; int e820_type; @@ -209,47 +203,47 @@ int __init efi_memblock_x86_reserve_range(void) #else pmap = (e->efi_memmap | 
((__u64)e->efi_memmap_hi << 32)); #endif - memmap.phys_map = pmap; - memmap.nr_map = e->efi_memmap_size / + efi.memmap.phys_map = pmap; + efi.memmap.nr_map = e->efi_memmap_size / e->efi_memdesc_size; - memmap.desc_size = e->efi_memdesc_size; - memmap.desc_version = e->efi_memdesc_version; + efi.memmap.desc_size = e->efi_memdesc_size; + efi.memmap.desc_version = e->efi_memdesc_version; - memblock_reserve(pmap, memmap.nr_map * memmap.desc_size); + WARN(efi.memmap.desc_version != 1, + "Unexpected EFI_MEMORY_DESCRIPTOR version %ld", + efi.memmap.desc_version); - efi.memmap = &memmap; + memblock_reserve(pmap, efi.memmap.nr_map * efi.memmap.desc_size); return 0; } void __init efi_print_memmap(void) { -#ifdef EFI_DEBUG efi_memory_desc_t *md; - void *p; - int i; + int i = 0; - for (p = memmap.map, i = 0; - p < memmap.map_end; - p += memmap.desc_size, i++) { + for_each_efi_memory_desc(md) { char buf[64]; - md = p; pr_info("mem%02u: %s range=[0x%016llx-0x%016llx] (%lluMB)\n", - i, efi_md_typeattr_format(buf, sizeof(buf), md), + i++, efi_md_typeattr_format(buf, sizeof(buf), md), md->phys_addr, md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1, (md->num_pages >> (20 - EFI_PAGE_SHIFT))); } -#endif /* EFI_DEBUG */ } void __init efi_unmap_memmap(void) { + unsigned long size; + clear_bit(EFI_MEMMAP, &efi.flags); - if (memmap.map) { - early_memunmap(memmap.map, memmap.nr_map * memmap.desc_size); - memmap.map = NULL; + + size = efi.memmap.nr_map * efi.memmap.desc_size; + if (efi.memmap.map) { + early_memunmap(efi.memmap.map, size); + efi.memmap.map = NULL; } } @@ -352,8 +346,6 @@ static int __init efi_systab_init(void *phys) efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff); - set_bit(EFI_SYSTEM_TABLES, &efi.flags); - return 0; } @@ -440,17 +432,22 @@ static int __init efi_runtime_init(void) static int __init efi_memmap_init(void) { + unsigned long addr, size; + if (efi_enabled(EFI_PARAVIRT)) return 0; /* Map the EFI memory map */ - memmap.map = early_memremap((unsigned long)memmap.phys_map, - memmap.nr_map * memmap.desc_size); - if (memmap.map == NULL) { + size = efi.memmap.nr_map * efi.memmap.desc_size; + addr = (unsigned long)efi.memmap.phys_map; + + efi.memmap.map = early_memremap(addr, size); + if (efi.memmap.map == NULL) { pr_err("Could not map the memory map!\n"); return -ENOMEM; } - memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); + + efi.memmap.map_end = efi.memmap.map + size; if (add_efi_memmap) do_add_efi_memmap(); @@ -552,12 +549,9 @@ void __init efi_set_executable(efi_memory_desc_t *md, bool executable) void __init runtime_code_page_mkexec(void) { efi_memory_desc_t *md; - void *p; /* Make EFI runtime service code area executable */ - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - + for_each_efi_memory_desc(md) { if (md->type != EFI_RUNTIME_SERVICES_CODE) continue; @@ -604,12 +598,10 @@ void __init old_map_region(efi_memory_desc_t *md) /* Merge contiguous regions of the same type and attribute */ static void __init efi_merge_regions(void) { - void *p; efi_memory_desc_t *md, *prev_md = NULL; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + for_each_efi_memory_desc(md) { u64 prev_size; - md = p; if (!prev_md) { prev_md = md; @@ -651,30 +643,31 @@ static void __init get_systab_virt_addr(efi_memory_desc_t *md) static void __init save_runtime_map(void) { #ifdef CONFIG_KEXEC_CORE + unsigned long desc_size; efi_memory_desc_t *md; - void *tmp, *p, *q = NULL; + void *tmp, *q = NULL; int count = 0; if 
(efi_enabled(EFI_OLD_MEMMAP)) return; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; + desc_size = efi.memmap.desc_size; + for_each_efi_memory_desc(md) { if (!(md->attribute & EFI_MEMORY_RUNTIME) || (md->type == EFI_BOOT_SERVICES_CODE) || (md->type == EFI_BOOT_SERVICES_DATA)) continue; - tmp = krealloc(q, (count + 1) * memmap.desc_size, GFP_KERNEL); + tmp = krealloc(q, (count + 1) * desc_size, GFP_KERNEL); if (!tmp) goto out; q = tmp; - memcpy(q + count * memmap.desc_size, md, memmap.desc_size); + memcpy(q + count * desc_size, md, desc_size); count++; } - efi_runtime_map_setup(q, count, memmap.desc_size); + efi_runtime_map_setup(q, count, desc_size); return; out: @@ -714,10 +707,10 @@ static inline void *efi_map_next_entry_reverse(void *entry) { /* Initial call */ if (!entry) - return memmap.map_end - memmap.desc_size; + return efi.memmap.map_end - efi.memmap.desc_size; - entry -= memmap.desc_size; - if (entry < memmap.map) + entry -= efi.memmap.desc_size; + if (entry < efi.memmap.map) return NULL; return entry; @@ -759,10 +752,10 @@ static void *efi_map_next_entry(void *entry) /* Initial call */ if (!entry) - return memmap.map; + return efi.memmap.map; - entry += memmap.desc_size; - if (entry >= memmap.map_end) + entry += efi.memmap.desc_size; + if (entry >= efi.memmap.map_end) return NULL; return entry; @@ -776,8 +769,11 @@ static void * __init efi_map_regions(int *count, int *pg_shift) { void *p, *new_memmap = NULL; unsigned long left = 0; + unsigned long desc_size; efi_memory_desc_t *md; + desc_size = efi.memmap.desc_size; + p = NULL; while ((p = efi_map_next_entry(p))) { md = p; @@ -792,7 +788,7 @@ static void * __init efi_map_regions(int *count, int *pg_shift) efi_map_region(md); get_systab_virt_addr(md); - if (left < memmap.desc_size) { + if (left < desc_size) { new_memmap = realloc_pages(new_memmap, *pg_shift); if (!new_memmap) return NULL; @@ -801,10 +797,9 @@ static void * __init efi_map_regions(int *count, int *pg_shift) (*pg_shift)++; } - memcpy(new_memmap + (*count * memmap.desc_size), md, - memmap.desc_size); + memcpy(new_memmap + (*count * desc_size), md, desc_size); - left -= memmap.desc_size; + left -= desc_size; (*count)++; } @@ -816,7 +811,6 @@ static void __init kexec_enter_virtual_mode(void) #ifdef CONFIG_KEXEC_CORE efi_memory_desc_t *md; unsigned int num_pages; - void *p; efi.systab = NULL; @@ -840,8 +834,7 @@ static void __init kexec_enter_virtual_mode(void) * Map efi regions which were passed via setup_data. The virt_addr is a * fixed addr which was used in first kernel of a kexec boot. 
*/ - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; + for_each_efi_memory_desc(md) { efi_map_region_fixed(md); /* FIXME: add error handling */ get_systab_virt_addr(md); } @@ -850,10 +843,10 @@ static void __init kexec_enter_virtual_mode(void) BUG_ON(!efi.systab); - num_pages = ALIGN(memmap.nr_map * memmap.desc_size, PAGE_SIZE); + num_pages = ALIGN(efi.memmap.nr_map * efi.memmap.desc_size, PAGE_SIZE); num_pages >>= PAGE_SHIFT; - if (efi_setup_page_tables(memmap.phys_map, num_pages)) { + if (efi_setup_page_tables(efi.memmap.phys_map, num_pages)) { clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); return; } @@ -937,16 +930,16 @@ static void __init __efi_enter_virtual_mode(void) if (efi_is_native()) { status = phys_efi_set_virtual_address_map( - memmap.desc_size * count, - memmap.desc_size, - memmap.desc_version, + efi.memmap.desc_size * count, + efi.memmap.desc_size, + efi.memmap.desc_version, (efi_memory_desc_t *)__pa(new_memmap)); } else { status = efi_thunk_set_virtual_address_map( efi_phys.set_virtual_address_map, - memmap.desc_size * count, - memmap.desc_size, - memmap.desc_version, + efi.memmap.desc_size * count, + efi.memmap.desc_size, + efi.memmap.desc_version, (efi_memory_desc_t *)__pa(new_memmap)); } @@ -1011,13 +1004,11 @@ void __init efi_enter_virtual_mode(void) u32 efi_mem_type(unsigned long phys_addr) { efi_memory_desc_t *md; - void *p; if (!efi_enabled(EFI_MEMMAP)) return 0; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; + for_each_efi_memory_desc(md) { if ((md->phys_addr <= phys_addr) && (phys_addr < (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)))) diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 49e4dd4a1f58..6e7242be1c87 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -55,14 +55,12 @@ struct efi_scratch efi_scratch; static void __init early_code_mapping_set_exec(int executable) { efi_memory_desc_t *md; - void *p; if (!(__supported_pte_mask & _PAGE_NX)) return; /* Make EFI service code area executable */ - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; + for_each_efi_memory_desc(md) { if (md->type == EFI_RUNTIME_SERVICES_CODE || md->type == EFI_BOOT_SERVICES_CODE) efi_set_executable(md, executable); @@ -253,7 +251,7 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) * Map all of RAM so that we can access arguments in the 1:1 * mapping when making EFI runtime calls. 
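Throughout these EFI changes the open-coded for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) walks are replaced by the for_each_efi_memory_desc() iterator over the global efi.memmap. A short sketch of the new pattern; efi_conventional_pages() is a hypothetical helper, not part of the series:

#include <linux/efi.h>

static u64 __init efi_conventional_pages(void)
{
	efi_memory_desc_t *md;
	u64 pages = 0;

	/* Walks every descriptor in efi.memmap; no local map pointer needed. */
	for_each_efi_memory_desc(md) {
		if (md->type == EFI_CONVENTIONAL_MEMORY)
			pages += md->num_pages;
	}

	return pages;
}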
*/ - for_each_efi_memory_desc(&memmap, md) { + for_each_efi_memory_desc(md) { if (md->type != EFI_CONVENTIONAL_MEMORY && md->type != EFI_LOADER_DATA && md->type != EFI_LOADER_CODE) @@ -398,7 +396,6 @@ void __init efi_runtime_update_mappings(void) unsigned long pfn; pgd_t *pgd = efi_pgd; efi_memory_desc_t *md; - void *p; if (efi_enabled(EFI_OLD_MEMMAP)) { if (__supported_pte_mask & _PAGE_NX) @@ -409,9 +406,8 @@ void __init efi_runtime_update_mappings(void) if (!efi_enabled(EFI_NX_PE_DATA)) return; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + for_each_efi_memory_desc(md) { unsigned long pf = 0; - md = p; if (!(md->attribute & EFI_MEMORY_RUNTIME)) continue; diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index ab50ada1d56e..097cb09d917b 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -195,10 +195,9 @@ static bool can_free_region(u64 start, u64 size) */ void __init efi_reserve_boot_services(void) { - void *p; + efi_memory_desc_t *md; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - efi_memory_desc_t *md = p; + for_each_efi_memory_desc(md) { u64 start = md->phys_addr; u64 size = md->num_pages << EFI_PAGE_SHIFT; bool already_reserved; @@ -250,10 +249,9 @@ void __init efi_reserve_boot_services(void) void __init efi_free_boot_services(void) { - void *p; + efi_memory_desc_t *md; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - efi_memory_desc_t *md = p; + for_each_efi_memory_desc(md) { unsigned long long start = md->phys_addr; unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c index 1584cbed0dce..815fec6e05e2 100644 --- a/arch/x86/platform/uv/bios_uv.c +++ b/arch/x86/platform/uv/bios_uv.c @@ -21,19 +21,20 @@ #include <linux/efi.h> #include <linux/export.h> +#include <linux/slab.h> #include <asm/efi.h> #include <linux/io.h> #include <asm/uv/bios.h> #include <asm/uv/uv_hub.h> -static struct uv_systab uv_systab; +struct uv_systab *uv_systab; s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) { - struct uv_systab *tab = &uv_systab; + struct uv_systab *tab = uv_systab; s64 ret; - if (!tab->function) + if (!tab || !tab->function) /* * BIOS does not support UV systab */ @@ -183,34 +184,31 @@ int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus) } EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target); - #ifdef CONFIG_EFI void uv_bios_init(void) { - struct uv_systab *tab; - - if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) || - (efi.uv_systab == (unsigned long)NULL)) { - printk(KERN_CRIT "No EFI UV System Table.\n"); - uv_systab.function = (unsigned long)NULL; + uv_systab = NULL; + if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) || !efi.uv_systab) { + pr_crit("UV: UVsystab: missing\n"); return; } - tab = (struct uv_systab *)ioremap(efi.uv_systab, - sizeof(struct uv_systab)); - if (strncmp(tab->signature, "UVST", 4) != 0) - printk(KERN_ERR "bad signature in UV system table!"); - - /* - * Copy table to permanent spot for later use. 
- */ - memcpy(&uv_systab, tab, sizeof(struct uv_systab)); - iounmap(tab); + uv_systab = ioremap(efi.uv_systab, sizeof(struct uv_systab)); + if (!uv_systab || strncmp(uv_systab->signature, UV_SYSTAB_SIG, 4)) { + pr_err("UV: UVsystab: bad signature!\n"); + iounmap(uv_systab); + return; + } - printk(KERN_INFO "EFI UV System Table Revision %d\n", - uv_systab.revision); + if (uv_systab->revision >= UV_SYSTAB_VERSION_UV4) { + iounmap(uv_systab); + uv_systab = ioremap(efi.uv_systab, uv_systab->size); + if (!uv_systab) { + pr_err("UV: UVsystab: ioremap(%d) failed!\n", + uv_systab->size); + return; + } + } + pr_info("UV: UVsystab: Revision:%x\n", uv_systab->revision); } -#else /* !CONFIG_EFI */ - -void uv_bios_init(void) { } #endif diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 3b6ec42718e4..fdb4d42b4ce5 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -37,7 +37,7 @@ static int timeout_base_ns[] = { }; static int timeout_us; -static int nobau; +static bool nobau = true; static int nobau_perm; static cycles_t congested_cycles; @@ -106,13 +106,28 @@ static char *stat_description[] = { "enable: number times use of the BAU was re-enabled" }; -static int __init -setup_nobau(char *arg) +static int __init setup_bau(char *arg) { - nobau = 1; + int result; + + if (!arg) + return -EINVAL; + + result = strtobool(arg, &nobau); + if (result) + return result; + + /* we need to flip the logic here, so that bau=y sets nobau to false */ + nobau = !nobau; + + if (!nobau) + pr_info("UV BAU Enabled\n"); + else + pr_info("UV BAU Disabled\n"); + return 0; } -early_param("nobau", setup_nobau); +early_param("bau", setup_bau); /* base pnode in this partition */ static int uv_base_pnode __read_mostly; @@ -131,10 +146,10 @@ set_bau_on(void) pr_info("BAU not initialized; cannot be turned on\n"); return; } - nobau = 0; + nobau = false; for_each_present_cpu(cpu) { bcp = &per_cpu(bau_control, cpu); - bcp->nobau = 0; + bcp->nobau = false; } pr_info("BAU turned on\n"); return; @@ -146,10 +161,10 @@ set_bau_off(void) int cpu; struct bau_control *bcp; - nobau = 1; + nobau = true; for_each_present_cpu(cpu) { bcp = &per_cpu(bau_control, cpu); - bcp->nobau = 1; + bcp->nobau = true; } pr_info("BAU turned off\n"); return; @@ -1886,7 +1901,7 @@ static void __init init_per_cpu_tunables(void) bcp = &per_cpu(bau_control, cpu); bcp->baudisabled = 0; if (nobau) - bcp->nobau = 1; + bcp->nobau = true; bcp->statp = &per_cpu(ptcstats, cpu); /* time interval to catch a hardware stay-busy bug */ bcp->timeout_interval = usec_2_cycles(2*timeout_us); @@ -2025,7 +2040,8 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp, return 1; } bcp->uvhub_master = *hmasterp; - bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id; + bcp->uvhub_cpu = uv_cpu_blade_processor_id(cpu); + if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) { printk(KERN_EMERG "%d cpus per uvhub invalid\n", bcp->uvhub_cpu); diff --git a/arch/x86/platform/uv/uv_sysfs.c b/arch/x86/platform/uv/uv_sysfs.c index 5d4ba301e776..e9da9ebd924a 100644 --- a/arch/x86/platform/uv/uv_sysfs.c +++ b/arch/x86/platform/uv/uv_sysfs.c @@ -34,7 +34,7 @@ static ssize_t partition_id_show(struct kobject *kobj, static ssize_t coherence_id_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return snprintf(buf, PAGE_SIZE, "%ld\n", partition_coherence_id()); + return snprintf(buf, PAGE_SIZE, "%ld\n", uv_partition_coherence_id()); } static struct kobj_attribute partition_id_attr = diff --git 
a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c index 2b158a9fa1d7..b333fc45f9ec 100644 --- a/arch/x86/platform/uv/uv_time.c +++ b/arch/x86/platform/uv/uv_time.c @@ -165,7 +165,7 @@ static __init int uv_rtc_allocate_timers(void) for_each_present_cpu(cpu) { int nid = cpu_to_node(cpu); int bid = uv_cpu_to_blade_id(cpu); - int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; + int bcpu = uv_cpu_blade_processor_id(cpu); struct uv_rtc_timer_head *head = blade_info[bid]; if (!head) { @@ -226,7 +226,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires) int pnode = uv_cpu_to_pnode(cpu); int bid = uv_cpu_to_blade_id(cpu); struct uv_rtc_timer_head *head = blade_info[bid]; - int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; + int bcpu = uv_cpu_blade_processor_id(cpu); u64 *t = &head->cpu[bcpu].expires; unsigned long flags; int next_cpu; @@ -262,7 +262,7 @@ static int uv_rtc_unset_timer(int cpu, int force) int pnode = uv_cpu_to_pnode(cpu); int bid = uv_cpu_to_blade_id(cpu); struct uv_rtc_timer_head *head = blade_info[bid]; - int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; + int bcpu = uv_cpu_blade_processor_id(cpu); u64 *t = &head->cpu[bcpu].expires; unsigned long flags; int rc = 0; diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c index 291226b952a9..9f14bd34581d 100644 --- a/arch/x86/power/hibernate_32.c +++ b/arch/x86/power/hibernate_32.c @@ -106,7 +106,7 @@ static int resume_physical_mapping_init(pgd_t *pgd_base) * normal page tables. * NOTE: We can mark everything as executable here */ - if (cpu_has_pse) { + if (boot_cpu_has(X86_FEATURE_PSE)) { set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC)); pfn += PTRS_PER_PTE; } else { diff --git a/arch/x86/ras/Kconfig b/arch/x86/ras/Kconfig index df280da34825..d957d5f21a86 100644 --- a/arch/x86/ras/Kconfig +++ b/arch/x86/ras/Kconfig @@ -1,4 +1,4 @@ -config AMD_MCE_INJ +config MCE_AMD_INJ tristate "Simple MCE injection interface for AMD processors" depends on RAS && EDAC_DECODE_MCE && DEBUG_FS && AMD_NB default n diff --git a/arch/x86/ras/Makefile b/arch/x86/ras/Makefile index dd2c98b84037..5f94546db280 100644 --- a/arch/x86/ras/Makefile +++ b/arch/x86/ras/Makefile @@ -1,2 +1,2 @@ -obj-$(CONFIG_AMD_MCE_INJ) += mce_amd_inj.o +obj-$(CONFIG_MCE_AMD_INJ) += mce_amd_inj.o diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c index 9e02dcaef683..e69f4701a076 100644 --- a/arch/x86/ras/mce_amd_inj.c +++ b/arch/x86/ras/mce_amd_inj.c @@ -290,14 +290,33 @@ static void do_inject(void) wrmsr_on_cpu(cpu, MSR_IA32_MCG_STATUS, (u32)mcg_status, (u32)(mcg_status >> 32)); - wrmsr_on_cpu(cpu, MSR_IA32_MCx_STATUS(b), - (u32)i_mce.status, (u32)(i_mce.status >> 32)); + if (boot_cpu_has(X86_FEATURE_SMCA)) { + if (inj_type == DFR_INT_INJ) { + wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_DESTAT(b), + (u32)i_mce.status, (u32)(i_mce.status >> 32)); + + wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_DEADDR(b), + (u32)i_mce.addr, (u32)(i_mce.addr >> 32)); + } else { + wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_STATUS(b), + (u32)i_mce.status, (u32)(i_mce.status >> 32)); + + wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_ADDR(b), + (u32)i_mce.addr, (u32)(i_mce.addr >> 32)); + } + + wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_MISC(b), + (u32)i_mce.misc, (u32)(i_mce.misc >> 32)); + } else { + wrmsr_on_cpu(cpu, MSR_IA32_MCx_STATUS(b), + (u32)i_mce.status, (u32)(i_mce.status >> 32)); - wrmsr_on_cpu(cpu, MSR_IA32_MCx_ADDR(b), - (u32)i_mce.addr, (u32)(i_mce.addr >> 32)); + wrmsr_on_cpu(cpu, MSR_IA32_MCx_ADDR(b), + (u32)i_mce.addr, (u32)(i_mce.addr >> 32)); - 
wrmsr_on_cpu(cpu, MSR_IA32_MCx_MISC(b), - (u32)i_mce.misc, (u32)(i_mce.misc >> 32)); + wrmsr_on_cpu(cpu, MSR_IA32_MCx_MISC(b), + (u32)i_mce.misc, (u32)(i_mce.misc >> 32)); + } toggle_hw_mce_inject(cpu, false); diff --git a/arch/x86/tools/calc_run_size.sh b/arch/x86/tools/calc_run_size.sh deleted file mode 100644 index 1a4c17bb3910..000000000000 --- a/arch/x86/tools/calc_run_size.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/sh -# -# Calculate the amount of space needed to run the kernel, including room for -# the .bss and .brk sections. -# -# Usage: -# objdump -h a.out | sh calc_run_size.sh - -NUM='\([0-9a-fA-F]*[ \t]*\)' -OUT=$(sed -n 's/^[ \t0-9]*.b[sr][sk][ \t]*'"$NUM$NUM$NUM$NUM"'.*/\1\4/p') -if [ -z "$OUT" ] ; then - echo "Never found .bss or .brk file offset" >&2 - exit 1 -fi - -OUT=$(echo ${OUT# }) -sizeA=$(printf "%d" 0x${OUT%% *}) -OUT=${OUT#* } -offsetA=$(printf "%d" 0x${OUT%% *}) -OUT=${OUT#* } -sizeB=$(printf "%d" 0x${OUT%% *}) -OUT=${OUT#* } -offsetB=$(printf "%d" 0x${OUT%% *}) - -run_size=$(( $offsetA + $sizeA + $sizeB )) - -# BFD linker shows the same file offset in ELF. -if [ "$offsetA" -ne "$offsetB" ] ; then - # Gold linker shows them as consecutive. - endB=$(( $offsetB + $sizeB )) - if [ "$endB" != "$run_size" ] ; then - printf "sizeA: 0x%x\n" $sizeA >&2 - printf "offsetA: 0x%x\n" $offsetA >&2 - printf "sizeB: 0x%x\n" $sizeB >&2 - printf "offsetB: 0x%x\n" $offsetB >&2 - echo ".bss and .brk are non-contiguous" >&2 - exit 1 - fi -fi - -printf "%d\n" $run_size -exit 0 diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 880862c7d9dd..760789ae8562 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -75,7 +75,6 @@ #include <asm/mach_traps.h> #include <asm/mwait.h> #include <asm/pci_x86.h> -#include <asm/pat.h> #include <asm/cpu.h> #ifdef CONFIG_ACPI @@ -1093,6 +1092,26 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) return ret; } +static u64 xen_read_msr(unsigned int msr) +{ + /* + * This will silently swallow a #GP from RDMSR. It may be worth + * changing that. + */ + int err; + + return xen_read_msr_safe(msr, &err); +} + +static void xen_write_msr(unsigned int msr, unsigned low, unsigned high) +{ + /* + * This will silently swallow a #GP from WRMSR. It may be worth + * changing that. + */ + xen_write_msr_safe(msr, low, high); +} + void xen_setup_shared_info(void) { if (!xen_feature(XENFEAT_auto_translated_physmap)) { @@ -1187,13 +1206,11 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, } static const struct pv_info xen_info __initconst = { - .paravirt_enabled = 1, .shared_kernel_pmd = 0, #ifdef CONFIG_X86_64 .extra_user_64bit_cs = FLAT_USER_CS64, #endif - .features = 0, .name = "Xen", }; @@ -1223,8 +1240,11 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .wbinvd = native_wbinvd, - .read_msr = xen_read_msr_safe, - .write_msr = xen_write_msr_safe, + .read_msr = xen_read_msr, + .write_msr = xen_write_msr, + + .read_msr_safe = xen_read_msr_safe, + .write_msr_safe = xen_write_msr_safe, .read_pmc = xen_read_pmc, @@ -1469,10 +1489,10 @@ static void xen_pvh_set_cr_flags(int cpu) * For BSP, PSE PGE are set in probe_page_size_mask(), for APs * set them here. For all, OSFXSR OSXMMEXCPT are set in fpu__init_cpu(). 
*/ - if (cpu_has_pse) + if (boot_cpu_has(X86_FEATURE_PSE)) cr4_set_bits_and_update_boot(X86_CR4_PSE); - if (cpu_has_pge) + if (boot_cpu_has(X86_FEATURE_PGE)) cr4_set_bits_and_update_boot(X86_CR4_PGE); } @@ -1506,12 +1526,16 @@ static void __init xen_pvh_early_guest_init(void) } #endif /* CONFIG_XEN_PVH */ +static void __init xen_dom0_set_legacy_features(void) +{ + x86_platform.legacy.rtc = 1; +} + /* First C function to be called on Xen boot */ asmlinkage __visible void __init xen_start_kernel(void) { struct physdev_set_iopl set_iopl; unsigned long initrd_start = 0; - u64 pat; int rc; if (!xen_start_info) @@ -1527,8 +1551,6 @@ asmlinkage __visible void __init xen_start_kernel(void) /* Install Xen paravirt ops */ pv_info = xen_info; - if (xen_initial_domain()) - pv_info.features |= PV_SUPPORTED_RTC; pv_init_ops = xen_init_ops; if (!xen_pvh_domain()) { pv_cpu_ops = xen_cpu_ops; @@ -1618,13 +1640,6 @@ asmlinkage __visible void __init xen_start_kernel(void) xen_start_info->nr_pages); xen_reserve_special_pages(); - /* - * Modify the cache mode translation tables to match Xen's PAT - * configuration. - */ - rdmsrl(MSR_IA32_CR_PAT, pat); - pat_init_cache_modes(pat); - /* keep using Xen gdt for now; no urgent need to change it */ #ifdef CONFIG_X86_32 @@ -1670,6 +1685,7 @@ asmlinkage __visible void __init xen_start_kernel(void) boot_params.hdr.ramdisk_image = initrd_start; boot_params.hdr.ramdisk_size = xen_start_info->mod_len; boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line); + boot_params.hdr.hardware_subarch = X86_SUBARCH_XEN; if (!xen_initial_domain()) { add_preferred_console("xenboot", 0, NULL); @@ -1687,6 +1703,8 @@ asmlinkage __visible void __init xen_start_kernel(void) .u.firmware_info.type = XEN_FW_KBD_SHIFT_FLAGS, }; + x86_platform.set_legacy_features = + xen_dom0_set_legacy_features; xen_init_vga(info, xen_start_info->console.dom0.info_size); xen_start_info->console.domU.mfn = 0; xen_start_info->console.domU.evtchn = 0; diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild index b56855a1382a..28cf4c5d65ef 100644 --- a/arch/xtensa/include/asm/Kbuild +++ b/arch/xtensa/include/asm/Kbuild @@ -22,6 +22,7 @@ generic-y += mm-arch-hooks.h generic-y += percpu.h generic-y += preempt.h generic-y += resource.h +generic-y += rwsem.h generic-y += sections.h generic-y += siginfo.h generic-y += statfs.h diff --git a/arch/xtensa/include/asm/rwsem.h b/arch/xtensa/include/asm/rwsem.h deleted file mode 100644 index 249619e7e7f2..000000000000 --- a/arch/xtensa/include/asm/rwsem.h +++ /dev/null @@ -1,131 +0,0 @@ -/* - * include/asm-xtensa/rwsem.h - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Largely copied from include/asm-ppc/rwsem.h - * - * Copyright (C) 2001 - 2005 Tensilica Inc. - */ - -#ifndef _XTENSA_RWSEM_H -#define _XTENSA_RWSEM_H - -#ifndef _LINUX_RWSEM_H -#error "Please don't include <asm/rwsem.h> directly, use <linux/rwsem.h> instead." 
-#endif - -#define RWSEM_UNLOCKED_VALUE 0x00000000 -#define RWSEM_ACTIVE_BIAS 0x00000001 -#define RWSEM_ACTIVE_MASK 0x0000ffff -#define RWSEM_WAITING_BIAS (-0x00010000) -#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS -#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - -/* - * lock for reading - */ -static inline void __down_read(struct rw_semaphore *sem) -{ - if (atomic_add_return(1,(atomic_t *)(&sem->count)) > 0) - smp_wmb(); - else - rwsem_down_read_failed(sem); -} - -static inline int __down_read_trylock(struct rw_semaphore *sem) -{ - int tmp; - - while ((tmp = sem->count) >= 0) { - if (tmp == cmpxchg(&sem->count, tmp, - tmp + RWSEM_ACTIVE_READ_BIAS)) { - smp_wmb(); - return 1; - } - } - return 0; -} - -/* - * lock for writing - */ -static inline void __down_write(struct rw_semaphore *sem) -{ - int tmp; - - tmp = atomic_add_return(RWSEM_ACTIVE_WRITE_BIAS, - (atomic_t *)(&sem->count)); - if (tmp == RWSEM_ACTIVE_WRITE_BIAS) - smp_wmb(); - else - rwsem_down_write_failed(sem); -} - -static inline int __down_write_trylock(struct rw_semaphore *sem) -{ - int tmp; - - tmp = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE, - RWSEM_ACTIVE_WRITE_BIAS); - smp_wmb(); - return tmp == RWSEM_UNLOCKED_VALUE; -} - -/* - * unlock after reading - */ -static inline void __up_read(struct rw_semaphore *sem) -{ - int tmp; - - smp_wmb(); - tmp = atomic_sub_return(1,(atomic_t *)(&sem->count)); - if (tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0) - rwsem_wake(sem); -} - -/* - * unlock after writing - */ -static inline void __up_write(struct rw_semaphore *sem) -{ - smp_wmb(); - if (atomic_sub_return(RWSEM_ACTIVE_WRITE_BIAS, - (atomic_t *)(&sem->count)) < 0) - rwsem_wake(sem); -} - -/* - * implement atomic add functionality - */ -static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) -{ - atomic_add(delta, (atomic_t *)(&sem->count)); -} - -/* - * downgrade write lock to read lock - */ -static inline void __downgrade_write(struct rw_semaphore *sem) -{ - int tmp; - - smp_wmb(); - tmp = atomic_add_return(-RWSEM_WAITING_BIAS, (atomic_t *)(&sem->count)); - if (tmp < 0) - rwsem_downgrade_wake(sem); -} - -/* - * implement exchange and add functionality - */ -static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) -{ - smp_mb(); - return atomic_add_return(delta, (atomic_t *)(&sem->count)); -} - -#endif /* _XTENSA_RWSEM_H */ diff --git a/arch/xtensa/kernel/perf_event.c b/arch/xtensa/kernel/perf_event.c index 54f01188c29c..a6b00b3af429 100644 --- a/arch/xtensa/kernel/perf_event.c +++ b/arch/xtensa/kernel/perf_event.c @@ -332,14 +332,14 @@ static int callchain_trace(struct stackframe *frame, void *data) void perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) { - xtensa_backtrace_kernel(regs, PERF_MAX_STACK_DEPTH, + xtensa_backtrace_kernel(regs, sysctl_perf_event_max_stack, callchain_trace, NULL, entry); } void perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) { - xtensa_backtrace_user(regs, PERF_MAX_STACK_DEPTH, + xtensa_backtrace_user(regs, sysctl_perf_event_max_stack, callchain_trace, entry); } diff --git a/block/blk-map.c b/block/blk-map.c index a54f0543b956..b9f88b7751fb 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -9,24 +9,6 @@ #include "blk.h" -static bool iovec_gap_to_prv(struct request_queue *q, - struct iovec *prv, struct iovec *cur) -{ - unsigned long prev_end; - - if (!queue_virt_boundary(q)) - return false; - - if (prv->iov_base == NULL && prv->iov_len == 0) - /* prv is not set - don't check */ - 
return false; - - prev_end = (unsigned long)(prv->iov_base + prv->iov_len); - - return (((unsigned long)cur->iov_base & queue_virt_boundary(q)) || - prev_end & queue_virt_boundary(q)); -} - int blk_rq_append_bio(struct request_queue *q, struct request *rq, struct bio *bio) { @@ -125,31 +107,18 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, struct rq_map_data *map_data, const struct iov_iter *iter, gfp_t gfp_mask) { - struct iovec iov, prv = {.iov_base = NULL, .iov_len = 0}; - bool copy = (q->dma_pad_mask & iter->count) || map_data; + bool copy = false; + unsigned long align = q->dma_pad_mask | queue_dma_alignment(q); struct bio *bio = NULL; struct iov_iter i; int ret; - if (!iter || !iter->count) - return -EINVAL; - - iov_for_each(iov, i, *iter) { - unsigned long uaddr = (unsigned long) iov.iov_base; - - if (!iov.iov_len) - return -EINVAL; - - /* - * Keep going so we check length of all segments - */ - if ((uaddr & queue_dma_alignment(q)) || - iovec_gap_to_prv(q, &prv, &iov)) - copy = true; - - prv.iov_base = iov.iov_base; - prv.iov_len = iov.iov_len; - } + if (map_data) + copy = true; + else if (iov_iter_alignment(iter) & align) + copy = true; + else if (queue_virt_boundary(q)) + copy = queue_virt_boundary(q) & iov_iter_gap_alignment(iter); i = *iter; do { diff --git a/crypto/testmgr.c b/crypto/testmgr.c index b86883aedca1..7d4acc449233 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -1776,6 +1776,7 @@ static int alg_test_drbg(const struct alg_test_desc *desc, const char *driver, static int do_test_rsa(struct crypto_akcipher *tfm, struct akcipher_testvec *vecs) { + char *xbuf[XBUFSIZE]; struct akcipher_request *req; void *outbuf_enc = NULL; void *outbuf_dec = NULL; @@ -1784,9 +1785,12 @@ static int do_test_rsa(struct crypto_akcipher *tfm, int err = -ENOMEM; struct scatterlist src, dst, src_tab[2]; + if (testmgr_alloc_buf(xbuf)) + return err; + req = akcipher_request_alloc(tfm, GFP_KERNEL); if (!req) - return err; + goto free_xbuf; init_completion(&result.completion); @@ -1804,9 +1808,14 @@ static int do_test_rsa(struct crypto_akcipher *tfm, if (!outbuf_enc) goto free_req; + if (WARN_ON(vecs->m_size > PAGE_SIZE)) + goto free_all; + + memcpy(xbuf[0], vecs->m, vecs->m_size); + sg_init_table(src_tab, 2); - sg_set_buf(&src_tab[0], vecs->m, 8); - sg_set_buf(&src_tab[1], vecs->m + 8, vecs->m_size - 8); + sg_set_buf(&src_tab[0], xbuf[0], 8); + sg_set_buf(&src_tab[1], xbuf[0] + 8, vecs->m_size - 8); sg_init_one(&dst, outbuf_enc, out_len_max); akcipher_request_set_crypt(req, src_tab, &dst, vecs->m_size, out_len_max); @@ -1825,7 +1834,7 @@ static int do_test_rsa(struct crypto_akcipher *tfm, goto free_all; } /* verify that encrypted message is equal to expected */ - if (memcmp(vecs->c, sg_virt(req->dst), vecs->c_size)) { + if (memcmp(vecs->c, outbuf_enc, vecs->c_size)) { pr_err("alg: rsa: encrypt test failed. 
Invalid output\n"); err = -EINVAL; goto free_all; @@ -1840,7 +1849,13 @@ static int do_test_rsa(struct crypto_akcipher *tfm, err = -ENOMEM; goto free_all; } - sg_init_one(&src, vecs->c, vecs->c_size); + + if (WARN_ON(vecs->c_size > PAGE_SIZE)) + goto free_all; + + memcpy(xbuf[0], vecs->c, vecs->c_size); + + sg_init_one(&src, xbuf[0], vecs->c_size); sg_init_one(&dst, outbuf_dec, out_len_max); init_completion(&result.completion); akcipher_request_set_crypt(req, &src, &dst, vecs->c_size, out_len_max); @@ -1867,6 +1882,8 @@ free_all: kfree(outbuf_enc); free_req: akcipher_request_free(req); +free_xbuf: + testmgr_free_buf(xbuf); return err; } diff --git a/drivers/base/regmap/internal.h b/drivers/base/regmap/internal.h index 5c79526245c2..a0380338946a 100644 --- a/drivers/base/regmap/internal.h +++ b/drivers/base/regmap/internal.h @@ -13,6 +13,7 @@ #ifndef _REGMAP_INTERNAL_H #define _REGMAP_INTERNAL_H +#include <linux/device.h> #include <linux/regmap.h> #include <linux/fs.h> #include <linux/list.h> diff --git a/drivers/base/regmap/regmap-mmio.c b/drivers/base/regmap/regmap-mmio.c index 7526906ca080..5189fd6182f6 100644 --- a/drivers/base/regmap/regmap-mmio.c +++ b/drivers/base/regmap/regmap-mmio.c @@ -23,6 +23,8 @@ #include <linux/regmap.h> #include <linux/slab.h> +#include "internal.h" + struct regmap_mmio_context { void __iomem *regs; unsigned val_bytes; @@ -212,6 +214,7 @@ static const struct regmap_bus regmap_mmio = { .reg_write = regmap_mmio_write, .reg_read = regmap_mmio_read, .free_context = regmap_mmio_free_context, + .val_format_endian_default = REGMAP_ENDIAN_LITTLE, }; static struct regmap_mmio_context *regmap_mmio_gen_context(struct device *dev, @@ -245,7 +248,7 @@ static struct regmap_mmio_context *regmap_mmio_gen_context(struct device *dev, ctx->val_bytes = config->val_bits / 8; ctx->clk = ERR_PTR(-ENODEV); - switch (config->reg_format_endian) { + switch (regmap_get_val_endian(dev, ®map_mmio, config)) { case REGMAP_ENDIAN_DEFAULT: case REGMAP_ENDIAN_LITTLE: #ifdef __LITTLE_ENDIAN diff --git a/drivers/base/regmap/regmap-spmi.c b/drivers/base/regmap/regmap-spmi.c index 7e58f6560399..4a36e415e938 100644 --- a/drivers/base/regmap/regmap-spmi.c +++ b/drivers/base/regmap/regmap-spmi.c @@ -142,7 +142,7 @@ static int regmap_spmi_ext_read(void *context, while (val_size) { len = min_t(size_t, val_size, 8); - err = spmi_ext_register_readl(context, addr, val, val_size); + err = spmi_ext_register_readl(context, addr, val, len); if (err) goto err_out; diff --git a/drivers/cpufreq/longhaul.c b/drivers/cpufreq/longhaul.c index 0f6b229afcb9..247bfa8eaddb 100644 --- a/drivers/cpufreq/longhaul.c +++ b/drivers/cpufreq/longhaul.c @@ -945,7 +945,7 @@ static int __init longhaul_init(void) } #endif #ifdef CONFIG_X86_IO_APIC - if (cpu_has_apic) { + if (boot_cpu_has(X86_FEATURE_APIC)) { printk(KERN_ERR PFX "APIC detected. Longhaul is currently " "broken in this configuration.\n"); return -ENODEV; diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 49768c08ac07..9b6800a79c7f 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -1052,7 +1052,6 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) struct mce *m = (struct mce *)data; struct cpuinfo_x86 *c = &cpu_data(m->extcpu); int ecc; - u32 ebx = cpuid_ebx(0x80000007); if (amd_filter_mce(m)) return NOTIFY_STOP; @@ -1075,7 +1074,7 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) ((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"), ((m->status & MCI_STATUS_POISON) ? 
"Poison" : "-")); - if (!!(ebx & BIT(3))) { + if (boot_cpu_has(X86_FEATURE_SMCA)) { u32 low, high; u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank); @@ -1094,7 +1093,7 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) if (m->status & MCI_STATUS_ADDRV) pr_emerg(HW_ERR "MC%d Error Address: 0x%016llx\n", m->bank, m->addr); - if (!!(ebx & BIT(3))) { + if (boot_cpu_has(X86_FEATURE_SMCA)) { decode_smca_errors(m); goto err_code; } @@ -1149,7 +1148,6 @@ static struct notifier_block amd_mce_dec_nb = { static int __init mce_amd_init(void) { struct cpuinfo_x86 *c = &boot_cpu_data; - u32 ebx; if (c->x86_vendor != X86_VENDOR_AMD) return -ENODEV; @@ -1205,9 +1203,8 @@ static int __init mce_amd_init(void) break; case 0x17: - ebx = cpuid_ebx(0x80000007); xec_mask = 0x3f; - if (!(ebx & BIT(3))) { + if (!boot_cpu_has(X86_FEATURE_SMCA)) { printk(KERN_WARNING "Decoding supported only on Scalable MCA processors.\n"); goto err_out; } diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig index e1670d533f97..6394152f648f 100644 --- a/drivers/firmware/efi/Kconfig +++ b/drivers/firmware/efi/Kconfig @@ -87,6 +87,31 @@ config EFI_RUNTIME_WRAPPERS config EFI_ARMSTUB bool +config EFI_BOOTLOADER_CONTROL + tristate "EFI Bootloader Control" + depends on EFI_VARS + default n + ---help--- + This module installs a reboot hook, such that if reboot() is + invoked with a string argument NNN, "NNN" is copied to the + "LoaderEntryOneShot" EFI variable, to be read by the + bootloader. If the string matches one of the boot labels + defined in its configuration, the bootloader will boot once + to that label. The "LoaderEntryRebootReason" EFI variable is + set with the reboot reason: "reboot" or "shutdown". The + bootloader reads this reboot reason and takes particular + action according to its policy. + +config EFI_CAPSULE_LOADER + tristate "EFI capsule loader" + depends on EFI + help + This option exposes a loader interface "/dev/efi_capsule_loader" for + users to load EFI capsules. This driver requires working runtime + capsule support in the firmware, which many OEMs do not provide. + + Most users should say N. 
+ endmenu config UEFI_CPER diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile index 62e654f255f4..a219640f881f 100644 --- a/drivers/firmware/efi/Makefile +++ b/drivers/firmware/efi/Makefile @@ -9,7 +9,8 @@ # KASAN_SANITIZE_runtime-wrappers.o := n -obj-$(CONFIG_EFI) += efi.o vars.o reboot.o +obj-$(CONFIG_EFI) += efi.o vars.o reboot.o memattr.o +obj-$(CONFIG_EFI) += capsule.o obj-$(CONFIG_EFI_VARS) += efivars.o obj-$(CONFIG_EFI_ESRT) += esrt.o obj-$(CONFIG_EFI_VARS_PSTORE) += efi-pstore.o @@ -18,7 +19,9 @@ obj-$(CONFIG_EFI_RUNTIME_MAP) += runtime-map.o obj-$(CONFIG_EFI_RUNTIME_WRAPPERS) += runtime-wrappers.o obj-$(CONFIG_EFI_STUB) += libstub/ obj-$(CONFIG_EFI_FAKE_MEMMAP) += fake_mem.o +obj-$(CONFIG_EFI_BOOTLOADER_CONTROL) += efibc.o arm-obj-$(CONFIG_EFI) := arm-init.o arm-runtime.o obj-$(CONFIG_ARM) += $(arm-obj-y) obj-$(CONFIG_ARM64) += $(arm-obj-y) +obj-$(CONFIG_EFI_CAPSULE_LOADER) += capsule-loader.o diff --git a/drivers/firmware/efi/arm-init.c b/drivers/firmware/efi/arm-init.c index 8714f8c271ba..ef90f0c4b70a 100644 --- a/drivers/firmware/efi/arm-init.c +++ b/drivers/firmware/efi/arm-init.c @@ -11,17 +11,19 @@ * */ +#define pr_fmt(fmt) "efi: " fmt + #include <linux/efi.h> #include <linux/init.h> #include <linux/memblock.h> #include <linux/mm_types.h> #include <linux/of.h> #include <linux/of_fdt.h> +#include <linux/platform_device.h> +#include <linux/screen_info.h> #include <asm/efi.h> -struct efi_memory_map memmap; - u64 efi_system_table; static int __init is_normal_ram(efi_memory_desc_t *md) @@ -40,7 +42,7 @@ static phys_addr_t efi_to_phys(unsigned long addr) { efi_memory_desc_t *md; - for_each_efi_memory_desc(&memmap, md) { + for_each_efi_memory_desc(md) { if (!(md->attribute & EFI_MEMORY_RUNTIME)) continue; if (md->virt_addr == 0) @@ -53,6 +55,36 @@ static phys_addr_t efi_to_phys(unsigned long addr) return addr; } +static __initdata unsigned long screen_info_table = EFI_INVALID_TABLE_ADDR; + +static __initdata efi_config_table_type_t arch_tables[] = { + {LINUX_EFI_ARM_SCREEN_INFO_TABLE_GUID, NULL, &screen_info_table}, + {NULL_GUID, NULL, NULL} +}; + +static void __init init_screen_info(void) +{ + struct screen_info *si; + + if (screen_info_table != EFI_INVALID_TABLE_ADDR) { + si = early_memremap_ro(screen_info_table, sizeof(*si)); + if (!si) { + pr_err("Could not map screen_info config table\n"); + return; + } + screen_info = *si; + early_memunmap(si, sizeof(*si)); + + /* dummycon on ARM needs non-zero values for columns/lines */ + screen_info.orig_video_cols = 80; + screen_info.orig_video_lines = 25; + } + + if (screen_info.orig_video_isVGA == VIDEO_TYPE_EFI && + memblock_is_map_memory(screen_info.lfb_base)) + memblock_mark_nomap(screen_info.lfb_base, screen_info.lfb_size); +} + static int __init uefi_init(void) { efi_char16_t *c16; @@ -85,6 +117,8 @@ static int __init uefi_init(void) efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff); + efi.runtime_version = efi.systab->hdr.revision; + /* Show what we know for posterity */ c16 = early_memremap_ro(efi_to_phys(efi.systab->fw_vendor), sizeof(vendor) * sizeof(efi_char16_t)); @@ -108,7 +142,8 @@ static int __init uefi_init(void) goto out; } retval = efi_config_parse_tables(config_tables, efi.systab->nr_tables, - sizeof(efi_config_table_t), NULL); + sizeof(efi_config_table_t), + arch_tables); early_memunmap(config_tables, table_size); out: @@ -143,7 +178,7 @@ static __init void reserve_regions(void) if (efi_enabled(EFI_DBG)) pr_info("Processing EFI memory map:\n"); - 
for_each_efi_memory_desc(&memmap, md) { + for_each_efi_memory_desc(md) { paddr = md->phys_addr; npages = md->num_pages; @@ -184,9 +219,9 @@ void __init efi_init(void) efi_system_table = params.system_table; - memmap.phys_map = params.mmap; - memmap.map = early_memremap_ro(params.mmap, params.mmap_size); - if (memmap.map == NULL) { + efi.memmap.phys_map = params.mmap; + efi.memmap.map = early_memremap_ro(params.mmap, params.mmap_size); + if (efi.memmap.map == NULL) { /* * If we are booting via UEFI, the UEFI memory map is the only * description of memory we have, so there is little point in @@ -194,28 +229,37 @@ void __init efi_init(void) */ panic("Unable to map EFI memory map.\n"); } - memmap.map_end = memmap.map + params.mmap_size; - memmap.desc_size = params.desc_size; - memmap.desc_version = params.desc_ver; + efi.memmap.map_end = efi.memmap.map + params.mmap_size; + efi.memmap.desc_size = params.desc_size; + efi.memmap.desc_version = params.desc_ver; + + WARN(efi.memmap.desc_version != 1, + "Unexpected EFI_MEMORY_DESCRIPTOR version %ld", + efi.memmap.desc_version); if (uefi_init() < 0) return; reserve_regions(); - early_memunmap(memmap.map, params.mmap_size); + efi_memattr_init(); + early_memunmap(efi.memmap.map, params.mmap_size); - if (IS_ENABLED(CONFIG_ARM)) { - /* - * ARM currently does not allow ioremap_cache() to be called on - * memory regions that are covered by struct page. So remove the - * UEFI memory map from the linear mapping. - */ - memblock_mark_nomap(params.mmap & PAGE_MASK, - PAGE_ALIGN(params.mmap_size + - (params.mmap & ~PAGE_MASK))); - } else { - memblock_reserve(params.mmap & PAGE_MASK, - PAGE_ALIGN(params.mmap_size + - (params.mmap & ~PAGE_MASK))); - } + memblock_reserve(params.mmap & PAGE_MASK, + PAGE_ALIGN(params.mmap_size + + (params.mmap & ~PAGE_MASK))); + + init_screen_info(); +} + +static int __init register_gop_device(void) +{ + void *pd; + + if (screen_info.orig_video_isVGA != VIDEO_TYPE_EFI) + return 0; + + pd = platform_device_register_data(NULL, "efi-framebuffer", 0, + &screen_info, sizeof(screen_info)); + return PTR_ERR_OR_ZERO(pd); } +subsys_initcall(register_gop_device); diff --git a/drivers/firmware/efi/arm-runtime.c b/drivers/firmware/efi/arm-runtime.c index 6ae21e41a429..17ccf0a8787a 100644 --- a/drivers/firmware/efi/arm-runtime.c +++ b/drivers/firmware/efi/arm-runtime.c @@ -42,11 +42,13 @@ static struct mm_struct efi_mm = { static bool __init efi_virtmap_init(void) { efi_memory_desc_t *md; + bool systab_found; efi_mm.pgd = pgd_alloc(&efi_mm); init_new_context(NULL, &efi_mm); - for_each_efi_memory_desc(&memmap, md) { + systab_found = false; + for_each_efi_memory_desc(md) { phys_addr_t phys = md->phys_addr; int ret; @@ -64,7 +66,25 @@ static bool __init efi_virtmap_init(void) &phys, ret); return false; } + /* + * If this entry covers the address of the UEFI system table, + * calculate and record its virtual address. 
+ */ + if (efi_system_table >= phys && + efi_system_table < phys + (md->num_pages * EFI_PAGE_SIZE)) { + efi.systab = (void *)(unsigned long)(efi_system_table - + phys + md->virt_addr); + systab_found = true; + } + } + if (!systab_found) { + pr_err("No virtual mapping found for the UEFI System Table\n"); + return false; } + + if (efi_memattr_apply_permissions(&efi_mm, efi_set_mapping_permissions)) + return false; + return true; } @@ -89,26 +109,17 @@ static int __init arm_enable_runtime_services(void) pr_info("Remapping and enabling EFI services.\n"); - mapsize = memmap.map_end - memmap.map; - memmap.map = (__force void *)ioremap_cache(memmap.phys_map, - mapsize); - if (!memmap.map) { - pr_err("Failed to remap EFI memory map\n"); - return -ENOMEM; - } - memmap.map_end = memmap.map + mapsize; - efi.memmap = &memmap; + mapsize = efi.memmap.map_end - efi.memmap.map; - efi.systab = (__force void *)ioremap_cache(efi_system_table, - sizeof(efi_system_table_t)); - if (!efi.systab) { - pr_err("Failed to remap EFI System Table\n"); + efi.memmap.map = memremap(efi.memmap.phys_map, mapsize, MEMREMAP_WB); + if (!efi.memmap.map) { + pr_err("Failed to remap EFI memory map\n"); return -ENOMEM; } - set_bit(EFI_SYSTEM_TABLES, &efi.flags); + efi.memmap.map_end = efi.memmap.map + mapsize; if (!efi_virtmap_init()) { - pr_err("No UEFI virtual mapping was installed -- runtime services will not be available\n"); + pr_err("UEFI virtual mapping missing or invalid -- runtime services will not be available\n"); return -ENOMEM; } @@ -116,8 +127,6 @@ static int __init arm_enable_runtime_services(void) efi_native_runtime_setup(); set_bit(EFI_RUNTIME_SERVICES, &efi.flags); - efi.runtime_version = efi.systab->hdr.revision; - return 0; } early_initcall(arm_enable_runtime_services); diff --git a/drivers/firmware/efi/capsule-loader.c b/drivers/firmware/efi/capsule-loader.c new file mode 100644 index 000000000000..c99c24bc79b0 --- /dev/null +++ b/drivers/firmware/efi/capsule-loader.c @@ -0,0 +1,343 @@ +/* + * EFI capsule loader driver. + * + * Copyright 2015 Intel Corporation + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + */ + +#define pr_fmt(fmt) "efi: " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/miscdevice.h> +#include <linux/highmem.h> +#include <linux/slab.h> +#include <linux/mutex.h> +#include <linux/efi.h> + +#define NO_FURTHER_WRITE_ACTION -1 + +struct capsule_info { + bool header_obtained; + int reset_type; + long index; + size_t count; + size_t total_size; + struct page **pages; + size_t page_bytes_remain; +}; + +/** + * efi_free_all_buff_pages - free all previous allocated buffer pages + * @cap_info: pointer to current instance of capsule_info structure + * + * In addition to freeing buffer pages, it flags NO_FURTHER_WRITE_ACTION + * to cease processing data in subsequent write(2) calls until close(2) + * is called. 
+ **/ +static void efi_free_all_buff_pages(struct capsule_info *cap_info) +{ + while (cap_info->index > 0) + __free_page(cap_info->pages[--cap_info->index]); + + cap_info->index = NO_FURTHER_WRITE_ACTION; +} + +/** + * efi_capsule_setup_info - obtain the efi capsule header in the binary and + * set up the capsule_info structure + * @cap_info: pointer to current instance of capsule_info structure + * @kbuff: a mapped first page buffer pointer + * @hdr_bytes: the total received number of bytes for efi header + **/ +static ssize_t efi_capsule_setup_info(struct capsule_info *cap_info, + void *kbuff, size_t hdr_bytes) +{ + efi_capsule_header_t *cap_hdr; + size_t pages_needed; + int ret; + void *temp_page; + + /* Only process data block that is larger than efi header size */ + if (hdr_bytes < sizeof(efi_capsule_header_t)) + return 0; + + /* Reset back to the correct offset of header */ + cap_hdr = kbuff - cap_info->count; + pages_needed = ALIGN(cap_hdr->imagesize, PAGE_SIZE) >> PAGE_SHIFT; + + if (pages_needed == 0) { + pr_err("%s: pages count invalid\n", __func__); + return -EINVAL; + } + + /* Check if the capsule binary is supported */ + ret = efi_capsule_supported(cap_hdr->guid, cap_hdr->flags, + cap_hdr->imagesize, + &cap_info->reset_type); + if (ret) { + pr_err("%s: efi_capsule_supported() failed\n", + __func__); + return ret; + } + + cap_info->total_size = cap_hdr->imagesize; + temp_page = krealloc(cap_info->pages, + pages_needed * sizeof(void *), + GFP_KERNEL | __GFP_ZERO); + if (!temp_page) { + pr_debug("%s: krealloc() failed\n", __func__); + return -ENOMEM; + } + + cap_info->pages = temp_page; + cap_info->header_obtained = true; + + return 0; +} + +/** + * efi_capsule_submit_update - invoke the efi_capsule_update API once binary + * upload is done + * @cap_info: pointer to current instance of capsule_info structure + **/ +static ssize_t efi_capsule_submit_update(struct capsule_info *cap_info) +{ + int ret; + void *cap_hdr_temp; + + cap_hdr_temp = kmap(cap_info->pages[0]); + if (!cap_hdr_temp) { + pr_debug("%s: kmap() failed\n", __func__); + return -EFAULT; + } + + ret = efi_capsule_update(cap_hdr_temp, cap_info->pages); + kunmap(cap_info->pages[0]); + if (ret) { + pr_err("%s: efi_capsule_update() failed\n", __func__); + return ret; + } + + /* Indicate capsule binary uploading is done */ + cap_info->index = NO_FURTHER_WRITE_ACTION; + pr_info("%s: Successfully uploaded capsule file with reboot type '%s'\n", + __func__, !cap_info->reset_type ? "RESET_COLD" : + cap_info->reset_type == 1 ? "RESET_WARM" : + "RESET_SHUTDOWN"); + return 0; +} + +/** + * efi_capsule_write - store the capsule binary and pass it to + * efi_capsule_update() API + * @file: file pointer + * @buff: buffer pointer + * @count: number of bytes in @buff + * @offp: not used + * + * Expectation: + * - A user space tool should start at the beginning of capsule binary and + * pass data in sequentially. + * - Users should close and re-open this file node in order to upload more + * capsules. + * - After an error is returned, the user should close the file and restart the + * operation for the next try; otherwise -EIO will be returned until the + * file is closed. + * - An EFI capsule header must be located at the beginning of the capsule + * binary file and passed in as the first block of data of the write operation. 
+ **/ +static ssize_t efi_capsule_write(struct file *file, const char __user *buff, + size_t count, loff_t *offp) +{ + int ret = 0; + struct capsule_info *cap_info = file->private_data; + struct page *page; + void *kbuff = NULL; + size_t write_byte; + + if (count == 0) + return 0; + + /* Return error while NO_FURTHER_WRITE_ACTION is flagged */ + if (cap_info->index < 0) + return -EIO; + + /* Only alloc a new page when previous page is full */ + if (!cap_info->page_bytes_remain) { + page = alloc_page(GFP_KERNEL); + if (!page) { + pr_debug("%s: alloc_page() failed\n", __func__); + ret = -ENOMEM; + goto failed; + } + + cap_info->pages[cap_info->index++] = page; + cap_info->page_bytes_remain = PAGE_SIZE; + } + + page = cap_info->pages[cap_info->index - 1]; + + kbuff = kmap(page); + if (!kbuff) { + pr_debug("%s: kmap() failed\n", __func__); + ret = -EFAULT; + goto failed; + } + kbuff += PAGE_SIZE - cap_info->page_bytes_remain; + + /* Copy capsule binary data from user space to kernel space buffer */ + write_byte = min_t(size_t, count, cap_info->page_bytes_remain); + if (copy_from_user(kbuff, buff, write_byte)) { + pr_debug("%s: copy_from_user() failed\n", __func__); + ret = -EFAULT; + goto fail_unmap; + } + cap_info->page_bytes_remain -= write_byte; + + /* Setup capsule binary info structure */ + if (!cap_info->header_obtained) { + ret = efi_capsule_setup_info(cap_info, kbuff, + cap_info->count + write_byte); + if (ret) + goto fail_unmap; + } + + cap_info->count += write_byte; + kunmap(page); + + /* Submit the full binary to efi_capsule_update() API */ + if (cap_info->header_obtained && + cap_info->count >= cap_info->total_size) { + if (cap_info->count > cap_info->total_size) { + pr_err("%s: upload size exceeded header defined size\n", + __func__); + ret = -EINVAL; + goto failed; + } + + ret = efi_capsule_submit_update(cap_info); + if (ret) + goto failed; + } + + return write_byte; + +fail_unmap: + kunmap(page); +failed: + efi_free_all_buff_pages(cap_info); + return ret; +} + +/** + * efi_capsule_flush - called by file close or file flush + * @file: file pointer + * @id: not used + * + * If a capsule is being partially uploaded, then calling this function + * is treated as upload termination: the completed buffer pages are freed + * and -ECANCELED is returned. + **/ +static int efi_capsule_flush(struct file *file, fl_owner_t id) +{ + int ret = 0; + struct capsule_info *cap_info = file->private_data; + + if (cap_info->index > 0) { + pr_err("%s: capsule upload not complete\n", __func__); + efi_free_all_buff_pages(cap_info); + ret = -ECANCELED; + } + + return ret; +} + +/** + * efi_capsule_release - called by file close + * @inode: not used + * @file: file pointer + * + * We will not free successfully submitted pages since efi update + * requires data to be maintained across system reboot. + **/ +static int efi_capsule_release(struct inode *inode, struct file *file) +{ + struct capsule_info *cap_info = file->private_data; + + kfree(cap_info->pages); + kfree(file->private_data); + file->private_data = NULL; + return 0; +} + +/** + * efi_capsule_open - called by file open + * @inode: not used + * @file: file pointer + * + * Will allocate a capsule_info structure for each file open call. + * This provides the capability to support multiple file opens, so that + * a user does not need to wait for others to finish in order to + * upload their capsule binary. 
+ **/ +static int efi_capsule_open(struct inode *inode, struct file *file) +{ + struct capsule_info *cap_info; + + cap_info = kzalloc(sizeof(*cap_info), GFP_KERNEL); + if (!cap_info) + return -ENOMEM; + + cap_info->pages = kzalloc(sizeof(void *), GFP_KERNEL); + if (!cap_info->pages) { + kfree(cap_info); + return -ENOMEM; + } + + file->private_data = cap_info; + + return 0; +} + +static const struct file_operations efi_capsule_fops = { + .owner = THIS_MODULE, + .open = efi_capsule_open, + .write = efi_capsule_write, + .flush = efi_capsule_flush, + .release = efi_capsule_release, + .llseek = no_llseek, +}; + +static struct miscdevice efi_capsule_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = "efi_capsule_loader", + .fops = &efi_capsule_fops, +}; + +static int __init efi_capsule_loader_init(void) +{ + int ret; + + if (!efi_enabled(EFI_RUNTIME_SERVICES)) + return -ENODEV; + + ret = misc_register(&efi_capsule_misc); + if (ret) + pr_err("%s: Failed to register misc char file node\n", + __func__); + + return ret; +} +module_init(efi_capsule_loader_init); + +static void __exit efi_capsule_loader_exit(void) +{ + misc_deregister(&efi_capsule_misc); +} +module_exit(efi_capsule_loader_exit); + +MODULE_DESCRIPTION("EFI capsule firmware binary loader"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/firmware/efi/capsule.c b/drivers/firmware/efi/capsule.c new file mode 100644 index 000000000000..53b9fd2293ee --- /dev/null +++ b/drivers/firmware/efi/capsule.c @@ -0,0 +1,308 @@ +/* + * EFI capsule support. + * + * Copyright 2013 Intel Corporation; author Matt Fleming + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + */ + +#define pr_fmt(fmt) "efi: " fmt + +#include <linux/slab.h> +#include <linux/mutex.h> +#include <linux/highmem.h> +#include <linux/efi.h> +#include <linux/vmalloc.h> +#include <asm/io.h> + +typedef struct { + u64 length; + u64 data; +} efi_capsule_block_desc_t; + +static bool capsule_pending; +static bool stop_capsules; +static int efi_reset_type = -1; + +/* + * capsule_mutex serialises access to capsule_pending, + * efi_reset_type and stop_capsules. + */ +static DEFINE_MUTEX(capsule_mutex); + +/** + * efi_capsule_pending - has a capsule been passed to the firmware? + * @reset_type: store the type of EFI reset if capsule is pending + * + * To ensure that the registered capsule is processed correctly by the + * firmware we need to perform a specific type of reset. If a capsule is + * pending return the reset type in @reset_type. + * + * This function will race with callers of efi_capsule_update(), for + * example, calling this function while somebody else is in + * efi_capsule_update() but hasn't reached efi_capsule_update_locked() + * will miss the updates to capsule_pending and efi_reset_type after + * efi_capsule_update_locked() completes. + * + * A non-racy use is from platform reboot code because we use + * system_state to ensure no capsules can be sent to the firmware once + * we're at SYSTEM_RESTART. See efi_capsule_update_locked(). + */ +bool efi_capsule_pending(int *reset_type) +{ + if (!capsule_pending) + return false; + + if (reset_type) + *reset_type = efi_reset_type; + + return true; +} + +/* + * Whitelist of EFI capsule flags that we support. + * + * We do not handle EFI_CAPSULE_INITIATE_RESET because that would + * require us to prepare the kernel for reboot. Refuse to load any + * capsules with that flag and any other flags that we do not know how + * to handle. 
+ */ +#define EFI_CAPSULE_SUPPORTED_FLAG_MASK \ + (EFI_CAPSULE_PERSIST_ACROSS_RESET | EFI_CAPSULE_POPULATE_SYSTEM_TABLE) + +/** + * efi_capsule_supported - does the firmware support the capsule? + * @guid: vendor guid of capsule + * @flags: capsule flags + * @size: size of capsule data + * @reset: the reset type required for this capsule + * + * Check whether a capsule with @flags is supported by the firmware + * and that @size doesn't exceed the maximum size for a capsule. + * + * No attempt is made to check @reset against the reset type required + * by any pending capsules because of the races involved. + */ +int efi_capsule_supported(efi_guid_t guid, u32 flags, size_t size, int *reset) +{ + efi_capsule_header_t capsule; + efi_capsule_header_t *cap_list[] = { &capsule }; + efi_status_t status; + u64 max_size; + + if (flags & ~EFI_CAPSULE_SUPPORTED_FLAG_MASK) + return -EINVAL; + + capsule.headersize = capsule.imagesize = sizeof(capsule); + memcpy(&capsule.guid, &guid, sizeof(efi_guid_t)); + capsule.flags = flags; + + status = efi.query_capsule_caps(cap_list, 1, &max_size, reset); + if (status != EFI_SUCCESS) + return efi_status_to_err(status); + + if (size > max_size) + return -ENOSPC; + + return 0; +} +EXPORT_SYMBOL_GPL(efi_capsule_supported); + +/* + * Every scatter gather list (block descriptor) page must end with a + * continuation pointer. The last continuation pointer of the last + * page must be zero to mark the end of the chain. + */ +#define SGLIST_PER_PAGE ((PAGE_SIZE / sizeof(efi_capsule_block_desc_t)) - 1) + +/* + * How many scatter gather list (block descriptor) pages do we need + * to map @count pages? + */ +static inline unsigned int sg_pages_num(unsigned int count) +{ + return DIV_ROUND_UP(count, SGLIST_PER_PAGE); +} + +/** + * efi_capsule_update_locked - pass a single capsule to the firmware + * @capsule: capsule to send to the firmware + * @sg_pages: array of scatter gather (block descriptor) pages + * @reset: the reset type required for @capsule + * + * Since this function must be called under capsule_mutex check + * whether efi_reset_type will conflict with @reset, and atomically + * set it and capsule_pending if a capsule was successfully sent to + * the firmware. + * + * We also check to see if the system is about to restart, and if so, + * abort. This avoids races between efi_capsule_update() and + * efi_capsule_pending(). + */ +static int +efi_capsule_update_locked(efi_capsule_header_t *capsule, + struct page **sg_pages, int reset) +{ + efi_physical_addr_t sglist_phys; + efi_status_t status; + + lockdep_assert_held(&capsule_mutex); + + /* + * If someone has already registered a capsule that requires a + * different reset type, we're out of luck and must abort. + */ + if (efi_reset_type >= 0 && efi_reset_type != reset) { + pr_err("Conflicting capsule reset type %d (%d).\n", + reset, efi_reset_type); + return -EINVAL; + } + + /* + * If the system is getting ready to restart it may have + * called efi_capsule_pending() to make decisions (such as + * whether to force an EFI reboot), and we're racing against + * that call. Abort in that case. 
+ */ + if (unlikely(stop_capsules)) { + pr_warn("Capsule update raced with reboot, aborting.\n"); + return -EINVAL; + } + + sglist_phys = page_to_phys(sg_pages[0]); + + status = efi.update_capsule(&capsule, 1, sglist_phys); + if (status == EFI_SUCCESS) { + capsule_pending = true; + efi_reset_type = reset; + } + + return efi_status_to_err(status); +} + +/** + * efi_capsule_update - send a capsule to the firmware + * @capsule: capsule to send to firmware + * @pages: an array of capsule data pages + * + * Build a scatter gather list with EFI capsule block descriptors to + * map the capsule described by @capsule with its data in @pages and + * send it to the firmware via the UpdateCapsule() runtime service. + * + * @capsule must be a virtual mapping of the first page in @pages + * (@pages[0]) in the kernel address space. That is, a + * capsule_header_t that describes the entire contents of the capsule + * must be at the start of the first data page. + * + * Even though this function will validate that the firmware supports + * the capsule guid, users will likely want to check that + * efi_capsule_supported() returns true before calling this function + * because it makes it easier to print helpful error messages. + * + * If the capsule is successfully submitted to the firmware, any + * subsequent calls to efi_capsule_pending() will return true. @pages + * must not be released or modified if this function returns + * successfully. + * + * Callers must be prepared for this function to fail, which can + * happen if we raced with system reboot or if there is already a + * pending capsule that has a reset type that conflicts with the one + * required by @capsule. Do NOT use efi_capsule_pending() to detect + * this conflict since that would be racy. Instead, submit the capsule + * to efi_capsule_update() and check the return value. + * + * Return 0 on success, a converted EFI status code on failure. 
+ */ +int efi_capsule_update(efi_capsule_header_t *capsule, struct page **pages) +{ + u32 imagesize = capsule->imagesize; + efi_guid_t guid = capsule->guid; + unsigned int count, sg_count; + u32 flags = capsule->flags; + struct page **sg_pages; + int rv, reset_type; + int i, j; + + rv = efi_capsule_supported(guid, flags, imagesize, &reset_type); + if (rv) + return rv; + + count = DIV_ROUND_UP(imagesize, PAGE_SIZE); + sg_count = sg_pages_num(count); + + sg_pages = kzalloc(sg_count * sizeof(*sg_pages), GFP_KERNEL); + if (!sg_pages) + return -ENOMEM; + + for (i = 0; i < sg_count; i++) { + sg_pages[i] = alloc_page(GFP_KERNEL); + if (!sg_pages[i]) { + rv = -ENOMEM; + goto out; + } + } + + for (i = 0; i < sg_count; i++) { + efi_capsule_block_desc_t *sglist; + + sglist = kmap(sg_pages[i]); + if (!sglist) { + rv = -ENOMEM; + goto out; + } + + for (j = 0; j < SGLIST_PER_PAGE && count > 0; j++) { + u64 sz = min_t(u64, imagesize, PAGE_SIZE); + + sglist[j].length = sz; + sglist[j].data = page_to_phys(*pages++); + + imagesize -= sz; + count--; + } + + /* Continuation pointer */ + sglist[j].length = 0; + + if (i + 1 == sg_count) + sglist[j].data = 0; + else + sglist[j].data = page_to_phys(sg_pages[i + 1]); + + kunmap(sg_pages[i]); + } + + mutex_lock(&capsule_mutex); + rv = efi_capsule_update_locked(capsule, sg_pages, reset_type); + mutex_unlock(&capsule_mutex); + +out: + for (i = 0; rv && i < sg_count; i++) { + if (sg_pages[i]) + __free_page(sg_pages[i]); + } + + kfree(sg_pages); + return rv; +} +EXPORT_SYMBOL_GPL(efi_capsule_update); + +static int capsule_reboot_notify(struct notifier_block *nb, unsigned long event, void *cmd) +{ + mutex_lock(&capsule_mutex); + stop_capsules = true; + mutex_unlock(&capsule_mutex); + + return NOTIFY_DONE; +} + +static struct notifier_block capsule_reboot_nb = { + .notifier_call = capsule_reboot_notify, +}; + +static int __init capsule_reboot_register(void) +{ + return register_reboot_notifier(&capsule_reboot_nb); +} +core_initcall(capsule_reboot_register); diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 3a69ed5ecfcb..05509f3aaee8 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -43,6 +43,7 @@ struct efi __read_mostly efi = { .config_table = EFI_INVALID_TABLE_ADDR, .esrt = EFI_INVALID_TABLE_ADDR, .properties_table = EFI_INVALID_TABLE_ADDR, + .mem_attr_table = EFI_INVALID_TABLE_ADDR, }; EXPORT_SYMBOL(efi); @@ -256,7 +257,7 @@ subsys_initcall(efisubsys_init); */ int __init efi_mem_desc_lookup(u64 phys_addr, efi_memory_desc_t *out_md) { - struct efi_memory_map *map = efi.memmap; + struct efi_memory_map *map = &efi.memmap; phys_addr_t p, e; if (!efi_enabled(EFI_MEMMAP)) { @@ -338,6 +339,7 @@ static __initdata efi_config_table_type_t common_tables[] = { {UGA_IO_PROTOCOL_GUID, "UGA", &efi.uga}, {EFI_SYSTEM_RESOURCE_TABLE_GUID, "ESRT", &efi.esrt}, {EFI_PROPERTIES_TABLE_GUID, "PROP", &efi.properties_table}, + {EFI_MEMORY_ATTRIBUTES_TABLE_GUID, "MEMATTR", &efi.mem_attr_table}, {NULL_GUID, NULL, NULL}, }; @@ -351,8 +353,9 @@ static __init int match_config_table(efi_guid_t *guid, for (i = 0; efi_guidcmp(table_types[i].guid, NULL_GUID); i++) { if (!efi_guidcmp(*guid, table_types[i].guid)) { *(table_types[i].ptr) = table; - pr_cont(" %s=0x%lx ", - table_types[i].name, table); + if (table_types[i].name) + pr_cont(" %s=0x%lx ", + table_types[i].name, table); return 1; } } @@ -620,16 +623,12 @@ char * __init efi_md_typeattr_format(char *buf, size_t size, */ u64 __weak efi_mem_attributes(unsigned long phys_addr) { - struct 
efi_memory_map *map; efi_memory_desc_t *md; - void *p; if (!efi_enabled(EFI_MEMMAP)) return 0; - map = efi.memmap; - for (p = map->map; p < map->map_end; p += map->desc_size) { - md = p; + for_each_efi_memory_desc(md) { if ((md->phys_addr <= phys_addr) && (phys_addr < (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)))) @@ -637,3 +636,36 @@ u64 __weak efi_mem_attributes(unsigned long phys_addr) } return 0; } + +int efi_status_to_err(efi_status_t status) +{ + int err; + + switch (status) { + case EFI_SUCCESS: + err = 0; + break; + case EFI_INVALID_PARAMETER: + err = -EINVAL; + break; + case EFI_OUT_OF_RESOURCES: + err = -ENOSPC; + break; + case EFI_DEVICE_ERROR: + err = -EIO; + break; + case EFI_WRITE_PROTECTED: + err = -EROFS; + break; + case EFI_SECURITY_VIOLATION: + err = -EACCES; + break; + case EFI_NOT_FOUND: + err = -ENOENT; + break; + default: + err = -EINVAL; + } + + return err; +} diff --git a/drivers/firmware/efi/efibc.c b/drivers/firmware/efi/efibc.c new file mode 100644 index 000000000000..8dd0c7085e59 --- /dev/null +++ b/drivers/firmware/efi/efibc.c @@ -0,0 +1,113 @@ +/* + * efibc: control EFI bootloaders which obey LoaderEntryOneShot var + * Copyright (c) 2013-2016, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#define pr_fmt(fmt) "efibc: " fmt + +#include <linux/efi.h> +#include <linux/module.h> +#include <linux/reboot.h> +#include <linux/slab.h> + +static void efibc_str_to_str16(const char *str, efi_char16_t *str16) +{ + size_t i; + + for (i = 0; i < strlen(str); i++) + str16[i] = str[i]; + + str16[i] = '\0'; +} + +static int efibc_set_variable(const char *name, const char *value) +{ + int ret; + efi_guid_t guid = LINUX_EFI_LOADER_ENTRY_GUID; + struct efivar_entry *entry; + size_t size = (strlen(value) + 1) * sizeof(efi_char16_t); + + if (size > sizeof(entry->var.Data)) { + pr_err("value is too large"); + return -EINVAL; + } + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) { + pr_err("failed to allocate efivar entry"); + return -ENOMEM; + } + + efibc_str_to_str16(name, entry->var.VariableName); + efibc_str_to_str16(value, (efi_char16_t *)entry->var.Data); + memcpy(&entry->var.VendorGuid, &guid, sizeof(guid)); + + ret = efivar_entry_set(entry, + EFI_VARIABLE_NON_VOLATILE + | EFI_VARIABLE_BOOTSERVICE_ACCESS + | EFI_VARIABLE_RUNTIME_ACCESS, + size, entry->var.Data, NULL); + if (ret) + pr_err("failed to set %s EFI variable: 0x%x\n", + name, ret); + + kfree(entry); + return ret; +} + +static int efibc_reboot_notifier_call(struct notifier_block *notifier, + unsigned long event, void *data) +{ + const char *reason = "shutdown"; + int ret; + + if (event == SYS_RESTART) + reason = "reboot"; + + ret = efibc_set_variable("LoaderEntryRebootReason", reason); + if (ret || !data) + return NOTIFY_DONE; + + efibc_set_variable("LoaderEntryOneShot", (char *)data); + + return NOTIFY_DONE; +} + +static struct notifier_block efibc_reboot_notifier = { + .notifier_call = efibc_reboot_notifier_call, +}; + +static int __init efibc_init(void) +{ + int ret; + + if (!efi_enabled(EFI_RUNTIME_SERVICES)) + return -ENODEV; + + ret = 
register_reboot_notifier(&efibc_reboot_notifier); + if (ret) + pr_err("unable to register reboot notifier\n"); + + return ret; +} +module_init(efibc_init); + +static void __exit efibc_exit(void) +{ + unregister_reboot_notifier(&efibc_reboot_notifier); +} +module_exit(efibc_exit); + +MODULE_AUTHOR("Jeremy Compostella <jeremy.compostella@intel.com>"); +MODULE_AUTHOR("Matt Gumbel <matthew.k.gumbel@intel.com"); +MODULE_DESCRIPTION("EFI Bootloader Control"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/firmware/efi/efivars.c b/drivers/firmware/efi/efivars.c index 096adcbcb5a9..116b244dee68 100644 --- a/drivers/firmware/efi/efivars.c +++ b/drivers/firmware/efi/efivars.c @@ -661,7 +661,7 @@ static void efivar_update_sysfs_entries(struct work_struct *work) return; err = efivar_init(efivar_update_sysfs_entry, entry, - true, false, &efivar_sysfs_list); + false, &efivar_sysfs_list); if (!err) break; @@ -730,8 +730,7 @@ int efivars_sysfs_init(void) return -ENOMEM; } - efivar_init(efivars_sysfs_callback, NULL, false, - true, &efivar_sysfs_list); + efivar_init(efivars_sysfs_callback, NULL, true, &efivar_sysfs_list); error = create_efivars_bin_attributes(); if (error) { diff --git a/drivers/firmware/efi/fake_mem.c b/drivers/firmware/efi/fake_mem.c index ed3a854950cc..48430aba13c1 100644 --- a/drivers/firmware/efi/fake_mem.c +++ b/drivers/firmware/efi/fake_mem.c @@ -57,7 +57,7 @@ static int __init cmp_fake_mem(const void *x1, const void *x2) void __init efi_fake_memmap(void) { u64 start, end, m_start, m_end, m_attr; - int new_nr_map = memmap.nr_map; + int new_nr_map = efi.memmap.nr_map; efi_memory_desc_t *md; phys_addr_t new_memmap_phy; void *new_memmap; @@ -68,8 +68,7 @@ void __init efi_fake_memmap(void) return; /* count up the number of EFI memory descriptor */ - for (old = memmap.map; old < memmap.map_end; old += memmap.desc_size) { - md = old; + for_each_efi_memory_desc(md) { start = md->phys_addr; end = start + (md->num_pages << EFI_PAGE_SHIFT) - 1; @@ -95,25 +94,25 @@ void __init efi_fake_memmap(void) } /* allocate memory for new EFI memmap */ - new_memmap_phy = memblock_alloc(memmap.desc_size * new_nr_map, + new_memmap_phy = memblock_alloc(efi.memmap.desc_size * new_nr_map, PAGE_SIZE); if (!new_memmap_phy) return; /* create new EFI memmap */ new_memmap = early_memremap(new_memmap_phy, - memmap.desc_size * new_nr_map); + efi.memmap.desc_size * new_nr_map); if (!new_memmap) { - memblock_free(new_memmap_phy, memmap.desc_size * new_nr_map); + memblock_free(new_memmap_phy, efi.memmap.desc_size * new_nr_map); return; } - for (old = memmap.map, new = new_memmap; - old < memmap.map_end; - old += memmap.desc_size, new += memmap.desc_size) { + for (old = efi.memmap.map, new = new_memmap; + old < efi.memmap.map_end; + old += efi.memmap.desc_size, new += efi.memmap.desc_size) { /* copy original EFI memory descriptor */ - memcpy(new, old, memmap.desc_size); + memcpy(new, old, efi.memmap.desc_size); md = new; start = md->phys_addr; end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1; @@ -134,8 +133,8 @@ void __init efi_fake_memmap(void) md->num_pages = (m_end - md->phys_addr + 1) >> EFI_PAGE_SHIFT; /* latter part */ - new += memmap.desc_size; - memcpy(new, old, memmap.desc_size); + new += efi.memmap.desc_size; + memcpy(new, old, efi.memmap.desc_size); md = new; md->phys_addr = m_end + 1; md->num_pages = (end - md->phys_addr + 1) >> @@ -147,16 +146,16 @@ void __init efi_fake_memmap(void) md->num_pages = (m_start - md->phys_addr) >> EFI_PAGE_SHIFT; /* middle part */ - new += memmap.desc_size; - 
memcpy(new, old, memmap.desc_size); + new += efi.memmap.desc_size; + memcpy(new, old, efi.memmap.desc_size); md = new; md->attribute |= m_attr; md->phys_addr = m_start; md->num_pages = (m_end - m_start + 1) >> EFI_PAGE_SHIFT; /* last part */ - new += memmap.desc_size; - memcpy(new, old, memmap.desc_size); + new += efi.memmap.desc_size; + memcpy(new, old, efi.memmap.desc_size); md = new; md->phys_addr = m_end + 1; md->num_pages = (end - m_end) >> @@ -169,8 +168,8 @@ void __init efi_fake_memmap(void) md->num_pages = (m_start - md->phys_addr) >> EFI_PAGE_SHIFT; /* latter part */ - new += memmap.desc_size; - memcpy(new, old, memmap.desc_size); + new += efi.memmap.desc_size; + memcpy(new, old, efi.memmap.desc_size); md = new; md->phys_addr = m_start; md->num_pages = (end - md->phys_addr + 1) >> @@ -182,10 +181,10 @@ void __init efi_fake_memmap(void) /* swap into new EFI memmap */ efi_unmap_memmap(); - memmap.map = new_memmap; - memmap.phys_map = new_memmap_phy; - memmap.nr_map = new_nr_map; - memmap.map_end = memmap.map + memmap.nr_map * memmap.desc_size; + efi.memmap.map = new_memmap; + efi.memmap.phys_map = new_memmap_phy; + efi.memmap.nr_map = new_nr_map; + efi.memmap.map_end = efi.memmap.map + efi.memmap.nr_map * efi.memmap.desc_size; set_bit(EFI_MEMMAP, &efi.flags); /* print new EFI memmap */ diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index da99bbb74aeb..c06945160a41 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -28,7 +28,7 @@ OBJECT_FILES_NON_STANDARD := y # Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. KCOV_INSTRUMENT := n -lib-y := efi-stub-helper.o +lib-y := efi-stub-helper.o gop.o # include the stub's generic dependencies from lib/ when building for ARM/arm64 arm-deps := fdt_rw.c fdt_ro.c fdt_wip.c fdt.c fdt_empty_tree.c fdt_sw.c sort.c diff --git a/drivers/firmware/efi/libstub/arm-stub.c b/drivers/firmware/efi/libstub/arm-stub.c index 414deb85c2e5..993aa56755f6 100644 --- a/drivers/firmware/efi/libstub/arm-stub.c +++ b/drivers/firmware/efi/libstub/arm-stub.c @@ -20,27 +20,49 @@ bool __nokaslr; -static int efi_secureboot_enabled(efi_system_table_t *sys_table_arg) +static int efi_get_secureboot(efi_system_table_t *sys_table_arg) { - static efi_guid_t const var_guid = EFI_GLOBAL_VARIABLE_GUID; - static efi_char16_t const var_name[] = { + static efi_char16_t const sb_var_name[] = { 'S', 'e', 'c', 'u', 'r', 'e', 'B', 'o', 'o', 't', 0 }; + static efi_char16_t const sm_var_name[] = { + 'S', 'e', 't', 'u', 'p', 'M', 'o', 'd', 'e', 0 }; + efi_guid_t var_guid = EFI_GLOBAL_VARIABLE_GUID; efi_get_variable_t *f_getvar = sys_table_arg->runtime->get_variable; - unsigned long size = sizeof(u8); - efi_status_t status; u8 val; + unsigned long size = sizeof(val); + efi_status_t status; - status = f_getvar((efi_char16_t *)var_name, (efi_guid_t *)&var_guid, + status = f_getvar((efi_char16_t *)sb_var_name, (efi_guid_t *)&var_guid, NULL, &size, &val); + if (status != EFI_SUCCESS) + goto out_efi_err; + + if (val == 0) + return 0; + + status = f_getvar((efi_char16_t *)sm_var_name, (efi_guid_t *)&var_guid, + NULL, &size, &val); + + if (status != EFI_SUCCESS) + goto out_efi_err; + + if (val == 1) + return 0; + + return 1; + +out_efi_err: switch (status) { - case EFI_SUCCESS: - return val; case EFI_NOT_FOUND: return 0; + case EFI_DEVICE_ERROR: + return -EIO; + case EFI_SECURITY_VIOLATION: + return -EACCES; default: - return 1; + return -EINVAL; } } @@ -147,6 +169,25 @@ void 
efi_char16_printk(efi_system_table_t *sys_table_arg, out->output_string(out, str); } +static struct screen_info *setup_graphics(efi_system_table_t *sys_table_arg) +{ + efi_guid_t gop_proto = EFI_GRAPHICS_OUTPUT_PROTOCOL_GUID; + efi_status_t status; + unsigned long size; + void **gop_handle = NULL; + struct screen_info *si = NULL; + + size = 0; + status = efi_call_early(locate_handle, EFI_LOCATE_BY_PROTOCOL, + &gop_proto, NULL, &size, gop_handle); + if (status == EFI_BUFFER_TOO_SMALL) { + si = alloc_screen_info(sys_table_arg); + if (!si) + return NULL; + efi_setup_gop(sys_table_arg, si, &gop_proto, size); + } + return si; +} /* * This function handles the architcture specific differences between arm and @@ -185,6 +226,8 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table, efi_guid_t loaded_image_proto = LOADED_IMAGE_PROTOCOL_GUID; unsigned long reserve_addr = 0; unsigned long reserve_size = 0; + int secure_boot = 0; + struct screen_info *si; /* Check if we were booted by the EFI firmware */ if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) @@ -237,6 +280,8 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table, __nokaslr = true; } + si = setup_graphics(sys_table); + status = handle_kernel_image(sys_table, image_addr, &image_size, &reserve_addr, &reserve_size, @@ -250,12 +295,21 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table, if (status != EFI_SUCCESS) pr_efi_err(sys_table, "Failed to parse EFI cmdline options\n"); + secure_boot = efi_get_secureboot(sys_table); + if (secure_boot > 0) + pr_efi(sys_table, "UEFI Secure Boot is enabled.\n"); + + if (secure_boot < 0) { + pr_efi_err(sys_table, + "could not determine UEFI Secure Boot status.\n"); + } + /* * Unauthenticated device tree data is a security hazard, so * ignore 'dtb=' unless UEFI Secure Boot is disabled. */ - if (efi_secureboot_enabled(sys_table)) { - pr_efi(sys_table, "UEFI Secure Boot is enabled.\n"); + if (secure_boot != 0 && strstr(cmdline_ptr, "dtb=")) { + pr_efi(sys_table, "Ignoring DTB from command line.\n"); } else { status = handle_cmdline_files(sys_table, image, cmdline_ptr, "dtb=", @@ -309,6 +363,7 @@ fail_free_image: efi_free(sys_table, image_size, *image_addr); efi_free(sys_table, reserve_size, reserve_addr); fail_free_cmdline: + free_screen_info(sys_table, si); efi_free(sys_table, cmdline_size, (unsigned long)cmdline_ptr); fail: return EFI_ERROR; diff --git a/drivers/firmware/efi/libstub/arm32-stub.c b/drivers/firmware/efi/libstub/arm32-stub.c index 6f42be4d0084..e1f0b28e1dcb 100644 --- a/drivers/firmware/efi/libstub/arm32-stub.c +++ b/drivers/firmware/efi/libstub/arm32-stub.c @@ -26,6 +26,43 @@ efi_status_t check_platform_features(efi_system_table_t *sys_table_arg) return EFI_SUCCESS; } +static efi_guid_t screen_info_guid = LINUX_EFI_ARM_SCREEN_INFO_TABLE_GUID; + +struct screen_info *alloc_screen_info(efi_system_table_t *sys_table_arg) +{ + struct screen_info *si; + efi_status_t status; + + /* + * Unlike on arm64, where we can directly fill out the screen_info + * structure from the stub, we need to allocate a buffer to hold + * its contents while we hand over to the kernel proper from the + * decompressor. 
+ */ + status = efi_call_early(allocate_pool, EFI_RUNTIME_SERVICES_DATA, + sizeof(*si), (void **)&si); + + if (status != EFI_SUCCESS) + return NULL; + + status = efi_call_early(install_configuration_table, + &screen_info_guid, si); + if (status == EFI_SUCCESS) + return si; + + efi_call_early(free_pool, si); + return NULL; +} + +void free_screen_info(efi_system_table_t *sys_table_arg, struct screen_info *si) +{ + if (!si) + return; + + efi_call_early(install_configuration_table, &screen_info_guid, NULL); + efi_call_early(free_pool, si); +} + efi_status_t handle_kernel_image(efi_system_table_t *sys_table, unsigned long *image_addr, unsigned long *image_size, diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c index 29ed2f9b218c..3bd127f95315 100644 --- a/drivers/firmware/efi/libstub/efi-stub-helper.c +++ b/drivers/firmware/efi/libstub/efi-stub-helper.c @@ -125,10 +125,12 @@ unsigned long get_dram_base(efi_system_table_t *sys_table_arg) map.map_end = map.map + map_size; - for_each_efi_memory_desc(&map, md) - if (md->attribute & EFI_MEMORY_WB) + for_each_efi_memory_desc_in_map(&map, md) { + if (md->attribute & EFI_MEMORY_WB) { if (membase > md->phys_addr) membase = md->phys_addr; + } + } efi_call_early(free_pool, map.map); diff --git a/drivers/firmware/efi/libstub/gop.c b/drivers/firmware/efi/libstub/gop.c new file mode 100644 index 000000000000..932742e4cf23 --- /dev/null +++ b/drivers/firmware/efi/libstub/gop.c @@ -0,0 +1,354 @@ +/* ----------------------------------------------------------------------- + * + * Copyright 2011 Intel Corporation; author Matt Fleming + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. 
+ * + * ----------------------------------------------------------------------- */ + +#include <linux/efi.h> +#include <linux/screen_info.h> +#include <asm/efi.h> +#include <asm/setup.h> + +static void find_bits(unsigned long mask, u8 *pos, u8 *size) +{ + u8 first, len; + + first = 0; + len = 0; + + if (mask) { + while (!(mask & 0x1)) { + mask = mask >> 1; + first++; + } + + while (mask & 0x1) { + mask = mask >> 1; + len++; + } + } + + *pos = first; + *size = len; +} + +static void +setup_pixel_info(struct screen_info *si, u32 pixels_per_scan_line, + struct efi_pixel_bitmask pixel_info, int pixel_format) +{ + if (pixel_format == PIXEL_RGB_RESERVED_8BIT_PER_COLOR) { + si->lfb_depth = 32; + si->lfb_linelength = pixels_per_scan_line * 4; + si->red_size = 8; + si->red_pos = 0; + si->green_size = 8; + si->green_pos = 8; + si->blue_size = 8; + si->blue_pos = 16; + si->rsvd_size = 8; + si->rsvd_pos = 24; + } else if (pixel_format == PIXEL_BGR_RESERVED_8BIT_PER_COLOR) { + si->lfb_depth = 32; + si->lfb_linelength = pixels_per_scan_line * 4; + si->red_size = 8; + si->red_pos = 16; + si->green_size = 8; + si->green_pos = 8; + si->blue_size = 8; + si->blue_pos = 0; + si->rsvd_size = 8; + si->rsvd_pos = 24; + } else if (pixel_format == PIXEL_BIT_MASK) { + find_bits(pixel_info.red_mask, &si->red_pos, &si->red_size); + find_bits(pixel_info.green_mask, &si->green_pos, + &si->green_size); + find_bits(pixel_info.blue_mask, &si->blue_pos, &si->blue_size); + find_bits(pixel_info.reserved_mask, &si->rsvd_pos, + &si->rsvd_size); + si->lfb_depth = si->red_size + si->green_size + + si->blue_size + si->rsvd_size; + si->lfb_linelength = (pixels_per_scan_line * si->lfb_depth) / 8; + } else { + si->lfb_depth = 4; + si->lfb_linelength = si->lfb_width / 2; + si->red_size = 0; + si->red_pos = 0; + si->green_size = 0; + si->green_pos = 0; + si->blue_size = 0; + si->blue_pos = 0; + si->rsvd_size = 0; + si->rsvd_pos = 0; + } +} + +static efi_status_t +__gop_query32(efi_system_table_t *sys_table_arg, + struct efi_graphics_output_protocol_32 *gop32, + struct efi_graphics_output_mode_info **info, + unsigned long *size, u64 *fb_base) +{ + struct efi_graphics_output_protocol_mode_32 *mode; + efi_graphics_output_protocol_query_mode query_mode; + efi_status_t status; + unsigned long m; + + m = gop32->mode; + mode = (struct efi_graphics_output_protocol_mode_32 *)m; + query_mode = (void *)(unsigned long)gop32->query_mode; + + status = __efi_call_early(query_mode, (void *)gop32, mode->mode, size, + info); + if (status != EFI_SUCCESS) + return status; + + *fb_base = mode->frame_buffer_base; + return status; +} + +static efi_status_t +setup_gop32(efi_system_table_t *sys_table_arg, struct screen_info *si, + efi_guid_t *proto, unsigned long size, void **gop_handle) +{ + struct efi_graphics_output_protocol_32 *gop32, *first_gop; + unsigned long nr_gops; + u16 width, height; + u32 pixels_per_scan_line; + u32 ext_lfb_base; + u64 fb_base; + struct efi_pixel_bitmask pixel_info; + int pixel_format; + efi_status_t status = EFI_NOT_FOUND; + u32 *handles = (u32 *)(unsigned long)gop_handle; + int i; + + first_gop = NULL; + gop32 = NULL; + + nr_gops = size / sizeof(u32); + for (i = 0; i < nr_gops; i++) { + struct efi_graphics_output_mode_info *info = NULL; + efi_guid_t conout_proto = EFI_CONSOLE_OUT_DEVICE_GUID; + bool conout_found = false; + void *dummy = NULL; + efi_handle_t h = (efi_handle_t)(unsigned long)handles[i]; + u64 current_fb_base; + + status = efi_call_early(handle_protocol, h, + proto, (void **)&gop32); + if (status != EFI_SUCCESS) + 
continue; + + status = efi_call_early(handle_protocol, h, + &conout_proto, &dummy); + if (status == EFI_SUCCESS) + conout_found = true; + + status = __gop_query32(sys_table_arg, gop32, &info, &size, + ¤t_fb_base); + if (status == EFI_SUCCESS && (!first_gop || conout_found)) { + /* + * Systems that use the UEFI Console Splitter may + * provide multiple GOP devices, not all of which are + * backed by real hardware. The workaround is to search + * for a GOP implementing the ConOut protocol, and if + * one isn't found, to just fall back to the first GOP. + */ + width = info->horizontal_resolution; + height = info->vertical_resolution; + pixel_format = info->pixel_format; + pixel_info = info->pixel_information; + pixels_per_scan_line = info->pixels_per_scan_line; + fb_base = current_fb_base; + + /* + * Once we've found a GOP supporting ConOut, + * don't bother looking any further. + */ + first_gop = gop32; + if (conout_found) + break; + } + } + + /* Did we find any GOPs? */ + if (!first_gop) + goto out; + + /* EFI framebuffer */ + si->orig_video_isVGA = VIDEO_TYPE_EFI; + + si->lfb_width = width; + si->lfb_height = height; + si->lfb_base = fb_base; + + ext_lfb_base = (u64)(unsigned long)fb_base >> 32; + if (ext_lfb_base) { + si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE; + si->ext_lfb_base = ext_lfb_base; + } + + si->pages = 1; + + setup_pixel_info(si, pixels_per_scan_line, pixel_info, pixel_format); + + si->lfb_size = si->lfb_linelength * si->lfb_height; + + si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS; +out: + return status; +} + +static efi_status_t +__gop_query64(efi_system_table_t *sys_table_arg, + struct efi_graphics_output_protocol_64 *gop64, + struct efi_graphics_output_mode_info **info, + unsigned long *size, u64 *fb_base) +{ + struct efi_graphics_output_protocol_mode_64 *mode; + efi_graphics_output_protocol_query_mode query_mode; + efi_status_t status; + unsigned long m; + + m = gop64->mode; + mode = (struct efi_graphics_output_protocol_mode_64 *)m; + query_mode = (void *)(unsigned long)gop64->query_mode; + + status = __efi_call_early(query_mode, (void *)gop64, mode->mode, size, + info); + if (status != EFI_SUCCESS) + return status; + + *fb_base = mode->frame_buffer_base; + return status; +} + +static efi_status_t +setup_gop64(efi_system_table_t *sys_table_arg, struct screen_info *si, + efi_guid_t *proto, unsigned long size, void **gop_handle) +{ + struct efi_graphics_output_protocol_64 *gop64, *first_gop; + unsigned long nr_gops; + u16 width, height; + u32 pixels_per_scan_line; + u32 ext_lfb_base; + u64 fb_base; + struct efi_pixel_bitmask pixel_info; + int pixel_format; + efi_status_t status = EFI_NOT_FOUND; + u64 *handles = (u64 *)(unsigned long)gop_handle; + int i; + + first_gop = NULL; + gop64 = NULL; + + nr_gops = size / sizeof(u64); + for (i = 0; i < nr_gops; i++) { + struct efi_graphics_output_mode_info *info = NULL; + efi_guid_t conout_proto = EFI_CONSOLE_OUT_DEVICE_GUID; + bool conout_found = false; + void *dummy = NULL; + efi_handle_t h = (efi_handle_t)(unsigned long)handles[i]; + u64 current_fb_base; + + status = efi_call_early(handle_protocol, h, + proto, (void **)&gop64); + if (status != EFI_SUCCESS) + continue; + + status = efi_call_early(handle_protocol, h, + &conout_proto, &dummy); + if (status == EFI_SUCCESS) + conout_found = true; + + status = __gop_query64(sys_table_arg, gop64, &info, &size, + ¤t_fb_base); + if (status == EFI_SUCCESS && (!first_gop || conout_found)) { + /* + * Systems that use the UEFI Console Splitter may + * provide multiple GOP devices, not 
all of which are + * backed by real hardware. The workaround is to search + * for a GOP implementing the ConOut protocol, and if + * one isn't found, to just fall back to the first GOP. + */ + width = info->horizontal_resolution; + height = info->vertical_resolution; + pixel_format = info->pixel_format; + pixel_info = info->pixel_information; + pixels_per_scan_line = info->pixels_per_scan_line; + fb_base = current_fb_base; + + /* + * Once we've found a GOP supporting ConOut, + * don't bother looking any further. + */ + first_gop = gop64; + if (conout_found) + break; + } + } + + /* Did we find any GOPs? */ + if (!first_gop) + goto out; + + /* EFI framebuffer */ + si->orig_video_isVGA = VIDEO_TYPE_EFI; + + si->lfb_width = width; + si->lfb_height = height; + si->lfb_base = fb_base; + + ext_lfb_base = (u64)(unsigned long)fb_base >> 32; + if (ext_lfb_base) { + si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE; + si->ext_lfb_base = ext_lfb_base; + } + + si->pages = 1; + + setup_pixel_info(si, pixels_per_scan_line, pixel_info, pixel_format); + + si->lfb_size = si->lfb_linelength * si->lfb_height; + + si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS; +out: + return status; +} + +/* + * See if we have Graphics Output Protocol + */ +efi_status_t efi_setup_gop(efi_system_table_t *sys_table_arg, + struct screen_info *si, efi_guid_t *proto, + unsigned long size) +{ + efi_status_t status; + void **gop_handle = NULL; + + status = efi_call_early(allocate_pool, EFI_LOADER_DATA, + size, (void **)&gop_handle); + if (status != EFI_SUCCESS) + return status; + + status = efi_call_early(locate_handle, + EFI_LOCATE_BY_PROTOCOL, + proto, NULL, &size, gop_handle); + if (status != EFI_SUCCESS) + goto free_handle; + + if (efi_is_64bit()) { + status = setup_gop64(sys_table_arg, si, proto, size, + gop_handle); + } else { + status = setup_gop32(sys_table_arg, si, proto, size, + gop_handle); + } + +free_handle: + efi_call_early(free_pool, gop_handle); + return status; +} diff --git a/drivers/firmware/efi/memattr.c b/drivers/firmware/efi/memattr.c new file mode 100644 index 000000000000..236004b9a50d --- /dev/null +++ b/drivers/firmware/efi/memattr.c @@ -0,0 +1,182 @@ +/* + * Copyright (C) 2016 Linaro Ltd. <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#define pr_fmt(fmt) "efi: memattr: " fmt + +#include <linux/efi.h> +#include <linux/init.h> +#include <linux/io.h> +#include <linux/memblock.h> + +#include <asm/early_ioremap.h> + +static int __initdata tbl_size; + +/* + * Reserve the memory associated with the Memory Attributes configuration + * table, if it exists. + */ +int __init efi_memattr_init(void) +{ + efi_memory_attributes_table_t *tbl; + + if (efi.mem_attr_table == EFI_INVALID_TABLE_ADDR) + return 0; + + tbl = early_memremap(efi.mem_attr_table, sizeof(*tbl)); + if (!tbl) { + pr_err("Failed to map EFI Memory Attributes table @ 0x%lx\n", + efi.mem_attr_table); + return -ENOMEM; + } + + if (tbl->version > 1) { + pr_warn("Unexpected EFI Memory Attributes table version %d\n", + tbl->version); + goto unmap; + } + + tbl_size = sizeof(*tbl) + tbl->num_entries * tbl->desc_size; + memblock_reserve(efi.mem_attr_table, tbl_size); + +unmap: + early_memunmap(tbl, sizeof(*tbl)); + return 0; +} + +/* + * Returns a copy @out of the UEFI memory descriptor @in if it is covered + * entirely by a UEFI memory map entry with matching attributes. 
The virtual + * address of @out is set according to the matching entry that was found. + */ +static bool entry_is_valid(const efi_memory_desc_t *in, efi_memory_desc_t *out) +{ + u64 in_paddr = in->phys_addr; + u64 in_size = in->num_pages << EFI_PAGE_SHIFT; + efi_memory_desc_t *md; + + *out = *in; + + if (in->type != EFI_RUNTIME_SERVICES_CODE && + in->type != EFI_RUNTIME_SERVICES_DATA) { + pr_warn("Entry type should be RuntimeServiceCode/Data\n"); + return false; + } + + if (!(in->attribute & (EFI_MEMORY_RO | EFI_MEMORY_XP))) { + pr_warn("Entry attributes invalid: RO and XP bits both cleared\n"); + return false; + } + + if (PAGE_SIZE > EFI_PAGE_SIZE && + (!PAGE_ALIGNED(in->phys_addr) || + !PAGE_ALIGNED(in->num_pages << EFI_PAGE_SHIFT))) { + /* + * Since arm64 may execute with page sizes of up to 64 KB, the + * UEFI spec mandates that RuntimeServices memory regions must + * be 64 KB aligned. We need to validate this here since we will + * not be able to tighten permissions on such regions without + * affecting adjacent regions. + */ + pr_warn("Entry address region misaligned\n"); + return false; + } + + for_each_efi_memory_desc(md) { + u64 md_paddr = md->phys_addr; + u64 md_size = md->num_pages << EFI_PAGE_SHIFT; + + if (!(md->attribute & EFI_MEMORY_RUNTIME)) + continue; + if (md->virt_addr == 0) { + /* no virtual mapping has been installed by the stub */ + break; + } + + if (md_paddr > in_paddr || (in_paddr - md_paddr) >= md_size) + continue; + + /* + * This entry covers the start of @in, check whether + * it covers the end as well. + */ + if (md_paddr + md_size < in_paddr + in_size) { + pr_warn("Entry covers multiple EFI memory map regions\n"); + return false; + } + + if (md->type != in->type) { + pr_warn("Entry type deviates from EFI memory map region type\n"); + return false; + } + + out->virt_addr = in_paddr + (md->virt_addr - md_paddr); + + return true; + } + + pr_warn("No matching entry found in the EFI memory map\n"); + return false; +} + +/* + * To be called after the EFI page tables have been populated. If a memory + * attributes table is available, its contents will be used to update the + * mappings with tightened permissions as described by the table. + * This requires the UEFI memory map to have already been populated with + * virtual addresses. + */ +int __init efi_memattr_apply_permissions(struct mm_struct *mm, + efi_memattr_perm_setter fn) +{ + efi_memory_attributes_table_t *tbl; + int i, ret; + + if (tbl_size <= sizeof(*tbl)) + return 0; + + /* + * We need the EFI memory map to be setup so we can use it to + * lookup the virtual addresses of all entries in the of EFI + * Memory Attributes table. If it isn't available, this + * function should not be called. + */ + if (WARN_ON(!efi_enabled(EFI_MEMMAP))) + return 0; + + tbl = memremap(efi.mem_attr_table, tbl_size, MEMREMAP_WB); + if (!tbl) { + pr_err("Failed to map EFI Memory Attributes table @ 0x%lx\n", + efi.mem_attr_table); + return -ENOMEM; + } + + if (efi_enabled(EFI_DBG)) + pr_info("Processing EFI Memory Attributes table:\n"); + + for (i = ret = 0; ret == 0 && i < tbl->num_entries; i++) { + efi_memory_desc_t md; + unsigned long size; + bool valid; + char buf[64]; + + valid = entry_is_valid((void *)tbl->entry + i * tbl->desc_size, + &md); + size = md.num_pages << EFI_PAGE_SHIFT; + if (efi_enabled(EFI_DBG) || !valid) + pr_info("%s 0x%012llx-0x%012llx %s\n", + valid ? 
"" : "!", md.phys_addr, + md.phys_addr + size - 1, + efi_md_typeattr_format(buf, sizeof(buf), &md)); + + if (valid) + ret = fn(mm, &md); + } + memunmap(tbl); + return ret; +} diff --git a/drivers/firmware/efi/reboot.c b/drivers/firmware/efi/reboot.c index 9c59d1c795d1..62ead9b9d871 100644 --- a/drivers/firmware/efi/reboot.c +++ b/drivers/firmware/efi/reboot.c @@ -9,7 +9,8 @@ int efi_reboot_quirk_mode = -1; void efi_reboot(enum reboot_mode reboot_mode, const char *__unused) { - int efi_mode; + const char *str[] = { "cold", "warm", "shutdown", "platform" }; + int efi_mode, cap_reset_mode; if (!efi_enabled(EFI_RUNTIME_SERVICES)) return; @@ -30,6 +31,15 @@ void efi_reboot(enum reboot_mode reboot_mode, const char *__unused) if (efi_reboot_quirk_mode != -1) efi_mode = efi_reboot_quirk_mode; + if (efi_capsule_pending(&cap_reset_mode)) { + if (efi_mode != cap_reset_mode) + printk(KERN_CRIT "efi: %s reset requested but pending " + "capsule update requires %s reset... Performing " + "%s reset.\n", str[efi_mode], str[cap_reset_mode], + str[cap_reset_mode]); + efi_mode = cap_reset_mode; + } + efi.reset_system(efi_mode, EFI_SUCCESS, 0, NULL); } diff --git a/drivers/firmware/efi/runtime-wrappers.c b/drivers/firmware/efi/runtime-wrappers.c index de6953039af6..23bef6bb73ee 100644 --- a/drivers/firmware/efi/runtime-wrappers.c +++ b/drivers/firmware/efi/runtime-wrappers.c @@ -16,10 +16,70 @@ #include <linux/bug.h> #include <linux/efi.h> +#include <linux/irqflags.h> #include <linux/mutex.h> #include <linux/spinlock.h> +#include <linux/stringify.h> #include <asm/efi.h> +static void efi_call_virt_check_flags(unsigned long flags, const char *call) +{ + unsigned long cur_flags, mismatch; + + local_save_flags(cur_flags); + + mismatch = flags ^ cur_flags; + if (!WARN_ON_ONCE(mismatch & ARCH_EFI_IRQ_FLAGS_MASK)) + return; + + add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_NOW_UNRELIABLE); + pr_err_ratelimited(FW_BUG "IRQ flags corrupted (0x%08lx=>0x%08lx) by EFI %s\n", + flags, cur_flags, call); + local_irq_restore(flags); +} + +/* + * Arch code can implement the following three template macros, avoiding + * reptition for the void/non-void return cases of {__,}efi_call_virt: + * + * * arch_efi_call_virt_setup + * + * Sets up the environment for the call (e.g. switching page tables, + * allowing kernel-mode use of floating point, if required). + * + * * arch_efi_call_virt + * + * Performs the call. The last expression in the macro must be the call + * itself, allowing the logic to be shared by the void and non-void + * cases. + * + * * arch_efi_call_virt_teardown + * + * Restores the usual kernel environment once the call has returned. + */ + +#define efi_call_virt(f, args...) \ +({ \ + efi_status_t __s; \ + unsigned long flags; \ + arch_efi_call_virt_setup(); \ + local_save_flags(flags); \ + __s = arch_efi_call_virt(f, args); \ + efi_call_virt_check_flags(flags, __stringify(f)); \ + arch_efi_call_virt_teardown(); \ + __s; \ +}) + +#define __efi_call_virt(f, args...) 
\ +({ \ + unsigned long flags; \ + arch_efi_call_virt_setup(); \ + local_save_flags(flags); \ + arch_efi_call_virt(f, args); \ + efi_call_virt_check_flags(flags, __stringify(f)); \ + arch_efi_call_virt_teardown(); \ +}) + /* * According to section 7.1 of the UEFI spec, Runtime Services are not fully * reentrant, and there are particular combinations of calls that need to be diff --git a/drivers/firmware/efi/vars.c b/drivers/firmware/efi/vars.c index 34b741940494..d3b751383286 100644 --- a/drivers/firmware/efi/vars.c +++ b/drivers/firmware/efi/vars.c @@ -329,39 +329,6 @@ check_var_size_nonblocking(u32 attributes, unsigned long size) return fops->query_variable_store(attributes, size, true); } -static int efi_status_to_err(efi_status_t status) -{ - int err; - - switch (status) { - case EFI_SUCCESS: - err = 0; - break; - case EFI_INVALID_PARAMETER: - err = -EINVAL; - break; - case EFI_OUT_OF_RESOURCES: - err = -ENOSPC; - break; - case EFI_DEVICE_ERROR: - err = -EIO; - break; - case EFI_WRITE_PROTECTED: - err = -EROFS; - break; - case EFI_SECURITY_VIOLATION: - err = -EACCES; - break; - case EFI_NOT_FOUND: - err = -ENOENT; - break; - default: - err = -EINVAL; - } - - return err; -} - static bool variable_is_present(efi_char16_t *variable_name, efi_guid_t *vendor, struct list_head *head) { @@ -452,8 +419,7 @@ static void dup_variable_bug(efi_char16_t *str16, efi_guid_t *vendor_guid, * Returns 0 on success, or a kernel error code on failure. */ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), - void *data, bool atomic, bool duplicates, - struct list_head *head) + void *data, bool duplicates, struct list_head *head) { const struct efivar_operations *ops = __efivars->ops; unsigned long variable_name_size = 1024; @@ -483,7 +449,7 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), &vendor_guid); switch (status) { case EFI_SUCCESS: - if (!atomic) + if (duplicates) spin_unlock_irq(&__efivars->lock); variable_name_size = var_name_strnsize(variable_name, @@ -498,21 +464,19 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), * and may end up looping here forever. 
*/ if (duplicates && - variable_is_present(variable_name, &vendor_guid, head)) { + variable_is_present(variable_name, &vendor_guid, + head)) { dup_variable_bug(variable_name, &vendor_guid, variable_name_size); - if (!atomic) - spin_lock_irq(&__efivars->lock); - status = EFI_NOT_FOUND; - break; + } else { + err = func(variable_name, vendor_guid, + variable_name_size, data); + if (err) + status = EFI_NOT_FOUND; } - err = func(variable_name, vendor_guid, variable_name_size, data); - if (err) - status = EFI_NOT_FOUND; - - if (!atomic) + if (duplicates) spin_lock_irq(&__efivars->lock); break; diff --git a/drivers/gpu/drm/amd/amdgpu/atombios_dp.c b/drivers/gpu/drm/amd/amdgpu/atombios_dp.c index bf731e9f643e..7f85c2c1d681 100644 --- a/drivers/gpu/drm/amd/amdgpu/atombios_dp.c +++ b/drivers/gpu/drm/amd/amdgpu/atombios_dp.c @@ -276,8 +276,8 @@ static int amdgpu_atombios_dp_get_dp_link_config(struct drm_connector *connector } } } else { - for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) { - for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) { + for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) { + for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) { max_pix_clock = (lane_num * link_rates[i] * 8) / bpp; if (max_pix_clock >= pix_clock) { *dp_lanes = lane_num; diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c index 6743ff7dccfa..059f7c39c582 100644 --- a/drivers/gpu/drm/drm_cache.c +++ b/drivers/gpu/drm/drm_cache.c @@ -72,7 +72,7 @@ drm_clflush_pages(struct page *pages[], unsigned long num_pages) { #if defined(CONFIG_X86) - if (cpu_has_clflush) { + if (static_cpu_has(X86_FEATURE_CLFLUSH)) { drm_cache_flush_clflush(pages, num_pages); return; } @@ -105,7 +105,7 @@ void drm_clflush_sg(struct sg_table *st) { #if defined(CONFIG_X86) - if (cpu_has_clflush) { + if (static_cpu_has(X86_FEATURE_CLFLUSH)) { struct sg_page_iter sg_iter; mb(); @@ -129,7 +129,7 @@ void drm_clflush_virt_range(void *addr, unsigned long length) { #if defined(CONFIG_X86) - if (cpu_has_clflush) { + if (static_cpu_has(X86_FEATURE_CLFLUSH)) { const int size = boot_cpu_data.x86_clflush_size; void *end = addr + length; addr = (void *)(((unsigned long)addr) & -size); diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index a0f1bd711b53..e3f4c725a1c6 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -2872,20 +2872,6 @@ static void intel_dp_info(struct seq_file *m, intel_panel_info(m, &intel_connector->panel); } -static void intel_dp_mst_info(struct seq_file *m, - struct intel_connector *intel_connector) -{ - struct intel_encoder *intel_encoder = intel_connector->encoder; - struct intel_dp_mst_encoder *intel_mst = - enc_to_mst(&intel_encoder->base); - struct intel_digital_port *intel_dig_port = intel_mst->primary; - struct intel_dp *intel_dp = &intel_dig_port->dp; - bool has_audio = drm_dp_mst_port_has_audio(&intel_dp->mst_mgr, - intel_connector->port); - - seq_printf(m, "\taudio support: %s\n", yesno(has_audio)); -} - static void intel_hdmi_info(struct seq_file *m, struct intel_connector *intel_connector) { @@ -2929,8 +2915,6 @@ static void intel_connector_info(struct seq_file *m, intel_hdmi_info(m, intel_connector); else if (intel_encoder->type == INTEL_OUTPUT_LVDS) intel_lvds_info(m, intel_connector); - else if (intel_encoder->type == INTEL_OUTPUT_DP_MST) - intel_dp_mst_info(m, intel_connector); } seq_printf(m, "\tmodes:\n"); diff --git 
a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index dabc08987b5e..f2cb9a9539ee 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -1732,7 +1732,7 @@ i915_gem_mmap_ioctl(struct drm_device *dev, void *data, if (args->flags & ~(I915_MMAP_WC)) return -EINVAL; - if (args->flags & I915_MMAP_WC && !cpu_has_pat) + if (args->flags & I915_MMAP_WC && !boot_cpu_has(X86_FEATURE_PAT)) return -ENODEV; obj = drm_gem_object_lookup(dev, file, args->handle); diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 1328bc5021b4..b845f468dd74 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -488,7 +488,7 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj, ret = relocate_entry_cpu(obj, reloc, target_offset); else if (obj->map_and_fenceable) ret = relocate_entry_gtt(obj, reloc, target_offset); - else if (cpu_has_clflush) + else if (static_cpu_has(X86_FEATURE_CLFLUSH)) ret = relocate_entry_clflush(obj, reloc, target_offset); else { WARN_ONCE(1, "Impossible case in relocation handling\n"); diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index fffdac801d3b..363bd79dea2e 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -7444,6 +7444,8 @@ enum skl_disp_power_wells { #define TRANS_CLK_SEL_DISABLED (0x0<<29) #define TRANS_CLK_SEL_PORT(x) (((x)+1)<<29) +#define CDCLK_FREQ _MMIO(0x46200) + #define _TRANSA_MSA_MISC 0x60410 #define _TRANSB_MSA_MISC 0x61410 #define _TRANSC_MSA_MISC 0x62410 diff --git a/drivers/gpu/drm/i915/intel_audio.c b/drivers/gpu/drm/i915/intel_audio.c index 30f921421b0c..7d281b40064a 100644 --- a/drivers/gpu/drm/i915/intel_audio.c +++ b/drivers/gpu/drm/i915/intel_audio.c @@ -262,8 +262,7 @@ static void hsw_audio_codec_disable(struct intel_encoder *encoder) tmp |= AUD_CONFIG_N_PROG_ENABLE; tmp &= ~AUD_CONFIG_UPPER_N_MASK; tmp &= ~AUD_CONFIG_LOWER_N_MASK; - if (intel_pipe_has_type(intel_crtc, INTEL_OUTPUT_DISPLAYPORT) || - intel_pipe_has_type(intel_crtc, INTEL_OUTPUT_DP_MST)) + if (intel_pipe_has_type(intel_crtc, INTEL_OUTPUT_DISPLAYPORT)) tmp |= AUD_CONFIG_N_VALUE_INDEX; I915_WRITE(HSW_AUD_CFG(pipe), tmp); @@ -476,8 +475,7 @@ static void ilk_audio_codec_enable(struct drm_connector *connector, tmp &= ~AUD_CONFIG_N_VALUE_INDEX; tmp &= ~AUD_CONFIG_N_PROG_ENABLE; tmp &= ~AUD_CONFIG_PIXEL_CLOCK_HDMI_MASK; - if (intel_pipe_has_type(intel_crtc, INTEL_OUTPUT_DISPLAYPORT) || - intel_pipe_has_type(intel_crtc, INTEL_OUTPUT_DP_MST)) + if (intel_pipe_has_type(intel_crtc, INTEL_OUTPUT_DISPLAYPORT)) tmp |= AUD_CONFIG_N_VALUE_INDEX; else tmp |= audio_config_hdmi_pixel_clock(adjusted_mode); @@ -515,8 +513,7 @@ void intel_audio_codec_enable(struct intel_encoder *intel_encoder) /* ELD Conn_Type */ connector->eld[5] &= ~(3 << 2); - if (intel_pipe_has_type(crtc, INTEL_OUTPUT_DISPLAYPORT) || - intel_pipe_has_type(crtc, INTEL_OUTPUT_DP_MST)) + if (intel_pipe_has_type(crtc, INTEL_OUTPUT_DISPLAYPORT)) connector->eld[5] |= (1 << 2); connector->eld[6] = drm_av_sync_delay(connector, adjusted_mode) / 2; diff --git a/drivers/gpu/drm/i915/intel_crt.c b/drivers/gpu/drm/i915/intel_crt.c index 505fc5cf26f8..0364292367b1 100644 --- a/drivers/gpu/drm/i915/intel_crt.c +++ b/drivers/gpu/drm/i915/intel_crt.c @@ -257,8 +257,14 @@ static bool intel_crt_compute_config(struct intel_encoder *encoder, pipe_config->has_pch_encoder = true; /* LPT FDI RX only supports 8bpc. 
*/ - if (HAS_PCH_LPT(dev)) + if (HAS_PCH_LPT(dev)) { + if (pipe_config->bw_constrained && pipe_config->pipe_bpp < 24) { + DRM_DEBUG_KMS("LPT only supports 24bpp\n"); + return false; + } + pipe_config->pipe_bpp = 24; + } /* FDI must always be 2.7 GHz */ if (HAS_DDI(dev)) { diff --git a/drivers/gpu/drm/i915/intel_ddi.c b/drivers/gpu/drm/i915/intel_ddi.c index 3b57bf06abe8..96ffcc541e17 100644 --- a/drivers/gpu/drm/i915/intel_ddi.c +++ b/drivers/gpu/drm/i915/intel_ddi.c @@ -3106,23 +3106,6 @@ void intel_ddi_fdi_disable(struct drm_crtc *crtc) I915_WRITE(FDI_RX_CTL(PIPE_A), val); } -bool intel_ddi_is_audio_enabled(struct drm_i915_private *dev_priv, - struct intel_crtc *intel_crtc) -{ - u32 temp; - - if (intel_display_power_get_if_enabled(dev_priv, POWER_DOMAIN_AUDIO)) { - temp = I915_READ(HSW_AUD_PIN_ELD_CP_VLD); - - intel_display_power_put(dev_priv, POWER_DOMAIN_AUDIO); - - if (temp & AUDIO_OUTPUT_ENABLE(intel_crtc->pipe)) - return true; - } - - return false; -} - void intel_ddi_get_config(struct intel_encoder *encoder, struct intel_crtc_state *pipe_config) { @@ -3183,8 +3166,11 @@ void intel_ddi_get_config(struct intel_encoder *encoder, break; } - pipe_config->has_audio = - intel_ddi_is_audio_enabled(dev_priv, intel_crtc); + if (intel_display_power_is_enabled(dev_priv, POWER_DOMAIN_AUDIO)) { + temp = I915_READ(HSW_AUD_PIN_ELD_CP_VLD); + if (temp & AUDIO_OUTPUT_ENABLE(intel_crtc->pipe)) + pipe_config->has_audio = true; + } if (encoder->type == INTEL_OUTPUT_EDP && dev_priv->vbt.edp_bpp && pipe_config->pipe_bpp > dev_priv->vbt.edp_bpp) { diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 182f84937345..0104a06d01fd 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -7988,9 +7988,6 @@ static void i9xx_get_pfit_config(struct intel_crtc *crtc, pipe_config->gmch_pfit.control = tmp; pipe_config->gmch_pfit.pgm_ratios = I915_READ(PFIT_PGM_RATIOS); - if (INTEL_INFO(dev)->gen < 5) - pipe_config->gmch_pfit.lvds_border_bits = - I915_READ(LVDS) & LVDS_BORDER_ENABLE; } static void vlv_crtc_clock_get(struct intel_crtc *crtc, @@ -9752,6 +9749,8 @@ static void broadwell_set_cdclk(struct drm_device *dev, int cdclk) sandybridge_pcode_write(dev_priv, HSW_PCODE_DE_WRITE_FREQ_REQ, data); mutex_unlock(&dev_priv->rps.hw_lock); + I915_WRITE(CDCLK_FREQ, DIV_ROUND_CLOSEST(cdclk, 1000) - 1); + intel_update_cdclk(dev); WARN(cdclk != dev_priv->cdclk_freq, diff --git a/drivers/gpu/drm/i915/intel_dp_mst.c b/drivers/gpu/drm/i915/intel_dp_mst.c index 937e77228466..2c999725b3d4 100644 --- a/drivers/gpu/drm/i915/intel_dp_mst.c +++ b/drivers/gpu/drm/i915/intel_dp_mst.c @@ -78,8 +78,6 @@ static bool intel_dp_mst_compute_config(struct intel_encoder *encoder, return false; } - if (drm_dp_mst_port_has_audio(&intel_dp->mst_mgr, found->port)) - pipe_config->has_audio = true; mst_pbn = drm_dp_calc_pbn_mode(adjusted_mode->crtc_clock, bpp); pipe_config->pbn = mst_pbn; @@ -104,11 +102,6 @@ static void intel_mst_disable_dp(struct intel_encoder *encoder) struct intel_dp_mst_encoder *intel_mst = enc_to_mst(&encoder->base); struct intel_digital_port *intel_dig_port = intel_mst->primary; struct intel_dp *intel_dp = &intel_dig_port->dp; - struct drm_device *dev = encoder->base.dev; - struct drm_i915_private *dev_priv = dev->dev_private; - struct drm_crtc *crtc = encoder->base.crtc; - struct intel_crtc *intel_crtc = to_intel_crtc(crtc); - int ret; DRM_DEBUG_KMS("%d\n", intel_dp->active_mst_links); @@ -119,10 +112,6 @@ static void intel_mst_disable_dp(struct 
intel_encoder *encoder) if (ret) { DRM_ERROR("failed to update payload %d\n", ret); } - if (intel_crtc->config->has_audio) { - intel_audio_codec_disable(encoder); - intel_display_power_put(dev_priv, POWER_DOMAIN_AUDIO); - } } static void intel_mst_post_disable_dp(struct intel_encoder *encoder) @@ -221,7 +210,6 @@ static void intel_mst_enable_dp(struct intel_encoder *encoder) struct intel_dp *intel_dp = &intel_dig_port->dp; struct drm_device *dev = intel_dig_port->base.base.dev; struct drm_i915_private *dev_priv = dev->dev_private; - struct intel_crtc *crtc = to_intel_crtc(encoder->base.crtc); enum port port = intel_dig_port->port; int ret; @@ -234,13 +222,6 @@ static void intel_mst_enable_dp(struct intel_encoder *encoder) ret = drm_dp_check_act_status(&intel_dp->mst_mgr); ret = drm_dp_update_payload_part2(&intel_dp->mst_mgr); - - if (crtc->config->has_audio) { - DRM_DEBUG_DRIVER("Enabling DP audio on pipe %c\n", - pipe_name(crtc->pipe)); - intel_display_power_get(dev_priv, POWER_DOMAIN_AUDIO); - intel_audio_codec_enable(encoder); - } } static bool intel_dp_mst_enc_get_hw_state(struct intel_encoder *encoder, @@ -266,9 +247,6 @@ static void intel_dp_mst_enc_get_config(struct intel_encoder *encoder, pipe_config->has_dp_encoder = true; - pipe_config->has_audio = - intel_ddi_is_audio_enabled(dev_priv, crtc); - temp = I915_READ(TRANS_DDI_FUNC_CTL(cpu_transcoder)); if (temp & TRANS_DDI_PHSYNC) flags |= DRM_MODE_FLAG_PHSYNC; diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h index 7d3af3a72abe..9d0770c23fde 100644 --- a/drivers/gpu/drm/i915/intel_drv.h +++ b/drivers/gpu/drm/i915/intel_drv.h @@ -1019,8 +1019,6 @@ void intel_ddi_set_pipe_settings(struct drm_crtc *crtc); void intel_ddi_prepare_link_retrain(struct intel_dp *intel_dp); bool intel_ddi_connector_get_hw_state(struct intel_connector *intel_connector); void intel_ddi_fdi_disable(struct drm_crtc *crtc); -bool intel_ddi_is_audio_enabled(struct drm_i915_private *dev_priv, - struct intel_crtc *intel_crtc); void intel_ddi_get_config(struct intel_encoder *encoder, struct intel_crtc_state *pipe_config); struct intel_encoder * diff --git a/drivers/gpu/drm/i915/intel_lvds.c b/drivers/gpu/drm/i915/intel_lvds.c index cd9fe609aefb..10dc3517b63b 100644 --- a/drivers/gpu/drm/i915/intel_lvds.c +++ b/drivers/gpu/drm/i915/intel_lvds.c @@ -123,6 +123,10 @@ static void intel_lvds_get_config(struct intel_encoder *encoder, pipe_config->base.adjusted_mode.flags |= flags; + if (INTEL_INFO(dev)->gen < 5) + pipe_config->gmch_pfit.lvds_border_bits = + tmp & LVDS_BORDER_ENABLE; + /* gen2/3 store dither state in pfit control, needs to match */ if (INTEL_INFO(dev)->gen < 4) { tmp = I915_READ(PFIT_CONTROL); diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c index 8ed3cf34f82d..3425d8e737b3 100644 --- a/drivers/gpu/drm/i915/intel_pm.c +++ b/drivers/gpu/drm/i915/intel_pm.c @@ -6646,6 +6646,12 @@ static void broadwell_init_clock_gating(struct drm_device *dev) misccpctl = I915_READ(GEN7_MISCCPCTL); I915_WRITE(GEN7_MISCCPCTL, misccpctl & ~GEN7_DOP_CLOCK_GATE_ENABLE); I915_WRITE(GEN8_L3SQCREG1, BDW_WA_L3SQCREG1_DEFAULT); + /* + * Wait at least 100 clocks before re-enabling clock gating. See + * the definition of L3SQCREG1 in BSpec. 
+ */ + POSTING_READ(GEN8_L3SQCREG1); + udelay(1); I915_WRITE(GEN7_MISCCPCTL, misccpctl); /* diff --git a/drivers/gpu/drm/radeon/atombios_crtc.c b/drivers/gpu/drm/radeon/atombios_crtc.c index b80b08f71cb4..532127c55de6 100644 --- a/drivers/gpu/drm/radeon/atombios_crtc.c +++ b/drivers/gpu/drm/radeon/atombios_crtc.c @@ -1742,6 +1742,7 @@ static u32 radeon_get_pll_use_mask(struct drm_crtc *crtc) static int radeon_get_shared_dp_ppll(struct drm_crtc *crtc) { struct drm_device *dev = crtc->dev; + struct radeon_device *rdev = dev->dev_private; struct drm_crtc *test_crtc; struct radeon_crtc *test_radeon_crtc; @@ -1751,6 +1752,10 @@ static int radeon_get_shared_dp_ppll(struct drm_crtc *crtc) test_radeon_crtc = to_radeon_crtc(test_crtc); if (test_radeon_crtc->encoder && ENCODER_MODE_IS_DP(atombios_get_encoder_mode(test_radeon_crtc->encoder))) { + /* PPLL2 is exclusive to UNIPHYA on DCE61 */ + if (ASIC_IS_DCE61(rdev) && !ASIC_IS_DCE8(rdev) && + test_radeon_crtc->pll_id == ATOM_PPLL2) + continue; /* for DP use the same PLL for all */ if (test_radeon_crtc->pll_id != ATOM_PPLL_INVALID) return test_radeon_crtc->pll_id; @@ -1772,6 +1777,7 @@ static int radeon_get_shared_nondp_ppll(struct drm_crtc *crtc) { struct radeon_crtc *radeon_crtc = to_radeon_crtc(crtc); struct drm_device *dev = crtc->dev; + struct radeon_device *rdev = dev->dev_private; struct drm_crtc *test_crtc; struct radeon_crtc *test_radeon_crtc; u32 adjusted_clock, test_adjusted_clock; @@ -1787,6 +1793,10 @@ static int radeon_get_shared_nondp_ppll(struct drm_crtc *crtc) test_radeon_crtc = to_radeon_crtc(test_crtc); if (test_radeon_crtc->encoder && !ENCODER_MODE_IS_DP(atombios_get_encoder_mode(test_radeon_crtc->encoder))) { + /* PPLL2 is exclusive to UNIPHYA on DCE61 */ + if (ASIC_IS_DCE61(rdev) && !ASIC_IS_DCE8(rdev) && + test_radeon_crtc->pll_id == ATOM_PPLL2) + continue; /* check if we are already driving this connector with another crtc */ if (test_radeon_crtc->connector == radeon_crtc->connector) { /* if we are, return that pll */ diff --git a/drivers/gpu/drm/radeon/atombios_dp.c b/drivers/gpu/drm/radeon/atombios_dp.c index afa9db1dc0e3..cead089a9e7d 100644 --- a/drivers/gpu/drm/radeon/atombios_dp.c +++ b/drivers/gpu/drm/radeon/atombios_dp.c @@ -326,8 +326,8 @@ int radeon_dp_get_dp_link_config(struct drm_connector *connector, } } } else { - for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) { - for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) { + for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) { + for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) { max_pix_clock = (lane_num * link_rates[i] * 8) / bpp; if (max_pix_clock >= pix_clock) { *dp_lanes = lane_num; diff --git a/drivers/gpu/drm/radeon/radeon_dp_auxch.c b/drivers/gpu/drm/radeon/radeon_dp_auxch.c index 3b0c229d7dcd..db64e0062689 100644 --- a/drivers/gpu/drm/radeon/radeon_dp_auxch.c +++ b/drivers/gpu/drm/radeon/radeon_dp_auxch.c @@ -105,7 +105,7 @@ radeon_dp_aux_transfer_native(struct drm_dp_aux *aux, struct drm_dp_aux_msg *msg tmp &= AUX_HPD_SEL(0x7); tmp |= AUX_HPD_SEL(chan->rec.hpd); - tmp |= AUX_EN | AUX_LS_READ_EN; + tmp |= AUX_EN | AUX_LS_READ_EN | AUX_HPD_DISCON(0x1); WREG32(AUX_CONTROL + aux_offset[instance], tmp); diff --git a/drivers/input/joystick/analog.c b/drivers/input/joystick/analog.c index 6f8b084e13d0..3d8ff09eba57 100644 --- a/drivers/input/joystick/analog.c +++ b/drivers/input/joystick/analog.c @@ -143,9 +143,9 @@ struct analog_port { #include <linux/i8253.h> -#define GET_TIME(x) do { 
if (cpu_has_tsc) x = (unsigned int)rdtsc(); else x = get_time_pit(); } while (0) -#define DELTA(x,y) (cpu_has_tsc ? ((y) - (x)) : ((x) - (y) + ((x) < (y) ? PIT_TICK_RATE / HZ : 0))) -#define TIME_NAME (cpu_has_tsc?"TSC":"PIT") +#define GET_TIME(x) do { if (boot_cpu_has(X86_FEATURE_TSC)) x = (unsigned int)rdtsc(); else x = get_time_pit(); } while (0) +#define DELTA(x,y) (boot_cpu_has(X86_FEATURE_TSC) ? ((y) - (x)) : ((x) - (y) + ((x) < (y) ? PIT_TICK_RATE / HZ : 0))) +#define TIME_NAME (boot_cpu_has(X86_FEATURE_TSC)?"TSC":"PIT") static unsigned int get_time_pit(void) { unsigned long flags; diff --git a/drivers/input/misc/max8997_haptic.c b/drivers/input/misc/max8997_haptic.c index a806ba3818f7..8d6326d7e7be 100644 --- a/drivers/input/misc/max8997_haptic.c +++ b/drivers/input/misc/max8997_haptic.c @@ -255,12 +255,14 @@ static int max8997_haptic_probe(struct platform_device *pdev) struct max8997_dev *iodev = dev_get_drvdata(pdev->dev.parent); const struct max8997_platform_data *pdata = dev_get_platdata(iodev->dev); - const struct max8997_haptic_platform_data *haptic_pdata = - pdata->haptic_pdata; + const struct max8997_haptic_platform_data *haptic_pdata = NULL; struct max8997_haptic *chip; struct input_dev *input_dev; int error; + if (pdata) + haptic_pdata = pdata->haptic_pdata; + if (!haptic_pdata) { dev_err(&pdev->dev, "no haptic platform data\n"); return -EINVAL; diff --git a/drivers/input/misc/twl6040-vibra.c b/drivers/input/misc/twl6040-vibra.c index df3581f60628..42de34b92996 100644 --- a/drivers/input/misc/twl6040-vibra.c +++ b/drivers/input/misc/twl6040-vibra.c @@ -257,6 +257,7 @@ static int twl6040_vibra_probe(struct platform_device *pdev) int vddvibr_uV = 0; int error; + of_node_get(twl6040_core_dev->of_node); twl6040_core_node = of_find_node_by_name(twl6040_core_dev->of_node, "vibra"); if (!twl6040_core_node) { diff --git a/drivers/input/mouse/byd.c b/drivers/input/mouse/byd.c index fdc243ca93ed..e583f8b50454 100644 --- a/drivers/input/mouse/byd.c +++ b/drivers/input/mouse/byd.c @@ -2,6 +2,10 @@ * BYD TouchPad PS/2 mouse driver * * Copyright (C) 2015 Chris Diamand <chris@diamand.org> + * Copyright (C) 2015 Richard Pospesel + * Copyright (C) 2015 Tai Chi Minh Ralph Eastwood + * Copyright (C) 2015 Martin Wimpress + * Copyright (C) 2015 Jay Kuri * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 as published by diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c index 8adaaeae3268..49721b4e1975 100644 --- a/drivers/iommu/irq_remapping.c +++ b/drivers/iommu/irq_remapping.c @@ -36,7 +36,7 @@ static void irq_remapping_disable_io_apic(void) * As this gets called during crash dump, keep this simple for * now. */ - if (cpu_has_apic || apic_from_smp_config()) + if (boot_cpu_has(X86_FEATURE_APIC) || apic_from_smp_config()) disconnect_bsp_APIC(0); } diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index adc162c7040d..6e9042e3d2a9 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -603,7 +603,7 @@ void __init lguest_arch_host_init(void) * doing this. */ get_online_cpus(); - if (cpu_has_pge) { /* We have a broader idea of "global". */ + if (boot_cpu_has(X86_FEATURE_PGE)) { /* We have a broader idea of "global". */ /* Remember that this was originally set (for cleanup). 
*/ cpu_had_pge = 1; /* diff --git a/drivers/media/v4l2-core/videobuf2-v4l2.c b/drivers/media/v4l2-core/videobuf2-v4l2.c index 7f366f1b0377..0b1b8c7b6ce5 100644 --- a/drivers/media/v4l2-core/videobuf2-v4l2.c +++ b/drivers/media/v4l2-core/videobuf2-v4l2.c @@ -74,11 +74,6 @@ static int __verify_planes_array(struct vb2_buffer *vb, const struct v4l2_buffer return 0; } -static int __verify_planes_array_core(struct vb2_buffer *vb, const void *pb) -{ - return __verify_planes_array(vb, pb); -} - /** * __verify_length() - Verify that the bytesused value for each plane fits in * the plane length and that the data offset doesn't exceed the bytesused value. @@ -442,7 +437,6 @@ static int __fill_vb2_buffer(struct vb2_buffer *vb, } static const struct vb2_buf_ops v4l2_buf_ops = { - .verify_planes_array = __verify_planes_array_core, .fill_user_buffer = __fill_v4l2_buffer, .fill_vb2_buffer = __fill_vb2_buffer, .copy_timestamp = __copy_timestamp, diff --git a/drivers/misc/sgi-gru/grukservices.c b/drivers/misc/sgi-gru/grukservices.c index 967b9dd24fe9..030769018461 100644 --- a/drivers/misc/sgi-gru/grukservices.c +++ b/drivers/misc/sgi-gru/grukservices.c @@ -718,8 +718,8 @@ cberr: static int send_message_put_nacked(void *cb, struct gru_message_queue_desc *mqd, void *mesg, int lines) { - unsigned long m, *val = mesg, gpa, save; - int ret; + unsigned long m; + int ret, loops = 200; /* experimentally determined */ m = mqd->mq_gpa + (gru_get_amo_value_head(cb) << 6); if (lines == 2) { @@ -735,22 +735,28 @@ static int send_message_put_nacked(void *cb, struct gru_message_queue_desc *mqd, return MQE_OK; /* - * Send a cross-partition interrupt to the SSI that contains the target - * message queue. Normally, the interrupt is automatically delivered by - * hardware but some error conditions require explicit delivery. - * Use the GRU to deliver the interrupt. Otherwise partition failures + * Send a noop message in order to deliver a cross-partition interrupt + * to the SSI that contains the target message queue. Normally, the + * interrupt is automatically delivered by hardware following mesq + * operations, but some error conditions require explicit delivery. + * The noop message will trigger delivery. Otherwise partition failures * could cause unrecovered errors. */ - gpa = uv_global_gru_mmr_address(mqd->interrupt_pnode, UVH_IPI_INT); - save = *val; - *val = uv_hub_ipi_value(mqd->interrupt_apicid, mqd->interrupt_vector, - dest_Fixed); - gru_vstore_phys(cb, gpa, gru_get_tri(mesg), IAA_REGISTER, IMA); - ret = gru_wait(cb); - *val = save; - if (ret != CBS_IDLE) - return MQE_UNEXPECTED_CB_ERR; - return MQE_OK; + do { + ret = send_noop_message(cb, mqd, mesg); + } while ((ret == MQIE_AGAIN || ret == MQE_CONGESTION) && (loops-- > 0)); + + if (ret == MQIE_AGAIN || ret == MQE_CONGESTION) { + /* + * Don't indicate to the app to resend the message, as it's + * already been successfully sent. We simply send an OK + * (rather than fail the send with MQE_UNEXPECTED_CB_ERR), + * assuming that the other side is receiving enough + * interrupts to get this message processed anyway. 
+ */ + ret = MQE_OK; + } + return ret; } /* diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c b/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c index b212488606da..11be8044e0d7 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c @@ -43,6 +43,7 @@ static void xgene_cle_idt_to_hw(u32 dstqid, u32 fpsel, static void xgene_cle_dbptr_to_hw(struct xgene_enet_pdata *pdata, struct xgene_cle_dbptr *dbptr, u32 *buf) { + buf[0] = SET_VAL(CLE_DROP, dbptr->drop); buf[4] = SET_VAL(CLE_FPSEL, dbptr->fpsel) | SET_VAL(CLE_DSTQIDL, dbptr->dstqid); @@ -412,7 +413,7 @@ static int xgene_enet_cle_init(struct xgene_enet_pdata *pdata) .branch = { { /* IPV4 */ - .valid = 0, + .valid = 1, .next_packet_pointer = 22, .jump_bw = JMP_FW, .jump_rel = JMP_ABS, @@ -420,7 +421,7 @@ static int xgene_enet_cle_init(struct xgene_enet_pdata *pdata) .next_node = PKT_PROT_NODE, .next_branch = 0, .data = 0x8, - .mask = 0xffff + .mask = 0x0 }, { .valid = 0, @@ -456,7 +457,7 @@ static int xgene_enet_cle_init(struct xgene_enet_pdata *pdata) .next_node = RSS_IPV4_TCP_NODE, .next_branch = 0, .data = 0x0600, - .mask = 0xffff + .mask = 0x00ff }, { /* UDP */ @@ -468,7 +469,7 @@ static int xgene_enet_cle_init(struct xgene_enet_pdata *pdata) .next_node = RSS_IPV4_UDP_NODE, .next_branch = 0, .data = 0x1100, - .mask = 0xffff + .mask = 0x00ff }, { .valid = 0, @@ -642,7 +643,7 @@ static int xgene_enet_cle_init(struct xgene_enet_pdata *pdata) { /* TCP DST Port */ .valid = 0, - .next_packet_pointer = 256, + .next_packet_pointer = 258, .jump_bw = JMP_FW, .jump_rel = JMP_ABS, .operation = EQT, diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_cle.h b/drivers/net/ethernet/apm/xgene/xgene_enet_cle.h index 29a17abdd828..3bf90683240e 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_cle.h +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_cle.h @@ -83,6 +83,8 @@ #define CLE_TYPE_POS 0 #define CLE_TYPE_LEN 2 +#define CLE_DROP_POS 28 +#define CLE_DROP_LEN 1 #define CLE_DSTQIDL_POS 25 #define CLE_DSTQIDL_LEN 7 #define CLE_DSTQIDH_POS 0 diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c index 39e081a70f5b..513d2a62ee6d 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c @@ -219,27 +219,30 @@ void xgene_enet_parse_error(struct xgene_enet_desc_ring *ring, struct xgene_enet_pdata *pdata, enum xgene_enet_err_code status) { - struct rtnl_link_stats64 *stats = &pdata->stats; - switch (status) { case INGRESS_CRC: - stats->rx_crc_errors++; + ring->rx_crc_errors++; + ring->rx_dropped++; break; case INGRESS_CHECKSUM: case INGRESS_CHECKSUM_COMPUTE: - stats->rx_errors++; + ring->rx_errors++; + ring->rx_dropped++; break; case INGRESS_TRUNC_FRAME: - stats->rx_frame_errors++; + ring->rx_frame_errors++; + ring->rx_dropped++; break; case INGRESS_PKT_LEN: - stats->rx_length_errors++; + ring->rx_length_errors++; + ring->rx_dropped++; break; case INGRESS_PKT_UNDER: - stats->rx_frame_errors++; + ring->rx_frame_errors++; + ring->rx_dropped++; break; case INGRESS_FIFO_OVERRUN: - stats->rx_fifo_errors++; + ring->rx_fifo_errors++; break; default: break; diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h index ba7da98af2ef..45220be3122f 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h @@ -86,7 +86,7 @@ enum xgene_enet_rm { #define RINGADDRL_POS 5 #define 
RINGADDRL_LEN 27 #define RINGADDRH_POS 0 -#define RINGADDRH_LEN 6 +#define RINGADDRH_LEN 7 #define RINGSIZE_POS 23 #define RINGSIZE_LEN 3 #define RINGTYPE_POS 19 @@ -94,9 +94,9 @@ enum xgene_enet_rm { #define RINGMODE_POS 20 #define RINGMODE_LEN 3 #define RECOMTIMEOUTL_POS 28 -#define RECOMTIMEOUTL_LEN 3 +#define RECOMTIMEOUTL_LEN 4 #define RECOMTIMEOUTH_POS 0 -#define RECOMTIMEOUTH_LEN 2 +#define RECOMTIMEOUTH_LEN 3 #define NUMMSGSINQ_POS 1 #define NUMMSGSINQ_LEN 16 #define ACCEPTLERR BIT(19) @@ -201,6 +201,8 @@ enum xgene_enet_rm { #define USERINFO_LEN 32 #define FPQNUM_POS 32 #define FPQNUM_LEN 12 +#define ELERR_POS 46 +#define ELERR_LEN 2 #define NV_POS 50 #define NV_LEN 1 #define LL_POS 51 diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c index 99d7e580e166..fd200883d228 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c @@ -443,8 +443,8 @@ static netdev_tx_t xgene_enet_start_xmit(struct sk_buff *skb, skb_tx_timestamp(skb); - pdata->stats.tx_packets++; - pdata->stats.tx_bytes += skb->len; + tx_ring->tx_packets++; + tx_ring->tx_bytes += skb->len; pdata->ring_ops->wr_cmd(tx_ring, count); return NETDEV_TX_OK; @@ -483,12 +483,12 @@ static int xgene_enet_rx_frame(struct xgene_enet_desc_ring *rx_ring, skb = buf_pool->rx_skb[skb_index]; /* checking for error */ - status = GET_VAL(LERR, le64_to_cpu(raw_desc->m0)); + status = (GET_VAL(ELERR, le64_to_cpu(raw_desc->m0)) << LERR_LEN) || + GET_VAL(LERR, le64_to_cpu(raw_desc->m0)); if (unlikely(status > 2)) { dev_kfree_skb_any(skb); xgene_enet_parse_error(rx_ring, netdev_priv(rx_ring->ndev), status); - pdata->stats.rx_dropped++; ret = -EIO; goto out; } @@ -506,8 +506,8 @@ static int xgene_enet_rx_frame(struct xgene_enet_desc_ring *rx_ring, xgene_enet_skip_csum(skb); } - pdata->stats.rx_packets++; - pdata->stats.rx_bytes += datalen; + rx_ring->rx_packets++; + rx_ring->rx_bytes += datalen; napi_gro_receive(&rx_ring->napi, skb); out: if (--rx_ring->nbufpool == 0) { @@ -630,7 +630,7 @@ static int xgene_enet_register_irq(struct net_device *ndev) ring = pdata->rx_ring[i]; irq_set_status_flags(ring->irq, IRQ_DISABLE_UNLAZY); ret = devm_request_irq(dev, ring->irq, xgene_enet_rx_irq, - IRQF_SHARED, ring->irq_name, ring); + 0, ring->irq_name, ring); if (ret) { netdev_err(ndev, "Failed to request irq %s\n", ring->irq_name); @@ -641,7 +641,7 @@ static int xgene_enet_register_irq(struct net_device *ndev) ring = pdata->tx_ring[i]->cp_ring; irq_set_status_flags(ring->irq, IRQ_DISABLE_UNLAZY); ret = devm_request_irq(dev, ring->irq, xgene_enet_rx_irq, - IRQF_SHARED, ring->irq_name, ring); + 0, ring->irq_name, ring); if (ret) { netdev_err(ndev, "Failed to request irq %s\n", ring->irq_name); @@ -1114,12 +1114,31 @@ static struct rtnl_link_stats64 *xgene_enet_get_stats64( { struct xgene_enet_pdata *pdata = netdev_priv(ndev); struct rtnl_link_stats64 *stats = &pdata->stats; + struct xgene_enet_desc_ring *ring; + int i; - stats->rx_errors += stats->rx_length_errors + - stats->rx_crc_errors + - stats->rx_frame_errors + - stats->rx_fifo_errors; - memcpy(storage, &pdata->stats, sizeof(struct rtnl_link_stats64)); + memset(stats, 0, sizeof(struct rtnl_link_stats64)); + for (i = 0; i < pdata->txq_cnt; i++) { + ring = pdata->tx_ring[i]; + if (ring) { + stats->tx_packets += ring->tx_packets; + stats->tx_bytes += ring->tx_bytes; + } + } + + for (i = 0; i < pdata->rxq_cnt; i++) { + ring = pdata->rx_ring[i]; + if (ring) { + stats->rx_packets += 
ring->rx_packets; + stats->rx_bytes += ring->rx_bytes; + stats->rx_errors += ring->rx_length_errors + + ring->rx_crc_errors + + ring->rx_frame_errors + + ring->rx_fifo_errors; + stats->rx_dropped += ring->rx_dropped; + } + } + memcpy(storage, stats, sizeof(struct rtnl_link_stats64)); return storage; } @@ -1234,6 +1253,13 @@ static int xgene_enet_get_irqs(struct xgene_enet_pdata *pdata) for (i = 0; i < max_irqs; i++) { ret = platform_get_irq(pdev, i); if (ret <= 0) { + if (pdata->phy_mode == PHY_INTERFACE_MODE_XGMII) { + max_irqs = i; + pdata->rxq_cnt = max_irqs / 2; + pdata->txq_cnt = max_irqs / 2; + pdata->cq_cnt = max_irqs / 2; + break; + } dev_err(dev, "Unable to get ENET IRQ\n"); ret = ret ? : -ENXIO; return ret; @@ -1437,19 +1463,28 @@ static void xgene_enet_setup_ops(struct xgene_enet_pdata *pdata) pdata->port_ops = &xgene_xgport_ops; pdata->cle_ops = &xgene_cle3in_ops; pdata->rm = RM0; - pdata->rxq_cnt = XGENE_NUM_RX_RING; - pdata->txq_cnt = XGENE_NUM_TX_RING; - pdata->cq_cnt = XGENE_NUM_TXC_RING; + if (!pdata->rxq_cnt) { + pdata->rxq_cnt = XGENE_NUM_RX_RING; + pdata->txq_cnt = XGENE_NUM_TX_RING; + pdata->cq_cnt = XGENE_NUM_TXC_RING; + } break; } if (pdata->enet_id == XGENE_ENET1) { switch (pdata->port_id) { case 0: - pdata->cpu_bufnum = START_CPU_BUFNUM_0; - pdata->eth_bufnum = START_ETH_BUFNUM_0; - pdata->bp_bufnum = START_BP_BUFNUM_0; - pdata->ring_num = START_RING_NUM_0; + if (pdata->phy_mode == PHY_INTERFACE_MODE_XGMII) { + pdata->cpu_bufnum = X2_START_CPU_BUFNUM_0; + pdata->eth_bufnum = X2_START_ETH_BUFNUM_0; + pdata->bp_bufnum = X2_START_BP_BUFNUM_0; + pdata->ring_num = START_RING_NUM_0; + } else { + pdata->cpu_bufnum = START_CPU_BUFNUM_0; + pdata->eth_bufnum = START_ETH_BUFNUM_0; + pdata->bp_bufnum = START_BP_BUFNUM_0; + pdata->ring_num = START_RING_NUM_0; + } break; case 1: if (pdata->phy_mode == PHY_INTERFACE_MODE_XGMII) { diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.h b/drivers/net/ethernet/apm/xgene/xgene_enet_main.h index 175d18890c7a..9d9cf445148c 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.h +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.h @@ -49,10 +49,10 @@ #define XGENE_ENET_MSS 1448 #define XGENE_MIN_ENET_FRAME_SIZE 60 -#define XGENE_MAX_ENET_IRQ 8 -#define XGENE_NUM_RX_RING 4 -#define XGENE_NUM_TX_RING 4 -#define XGENE_NUM_TXC_RING 4 +#define XGENE_MAX_ENET_IRQ 16 +#define XGENE_NUM_RX_RING 8 +#define XGENE_NUM_TX_RING 8 +#define XGENE_NUM_TXC_RING 8 #define START_CPU_BUFNUM_0 0 #define START_ETH_BUFNUM_0 2 @@ -121,6 +121,16 @@ struct xgene_enet_desc_ring { struct xgene_enet_raw_desc16 *raw_desc16; }; __le64 *exp_bufs; + u64 tx_packets; + u64 tx_bytes; + u64 rx_packets; + u64 rx_bytes; + u64 rx_dropped; + u64 rx_errors; + u64 rx_length_errors; + u64 rx_crc_errors; + u64 rx_frame_errors; + u64 rx_fifo_errors; }; struct xgene_mac_ops { diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.h b/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.h index 29a71b4dcc44..002df5a6756e 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.h +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.h @@ -33,7 +33,7 @@ #define LINK_STATUS BIT(2) #define LINK_UP BIT(15) #define MPA_IDLE_WITH_QMI_EMPTY BIT(12) -#define SG_RX_DV_GATE_REG_0_ADDR 0x0dfc +#define SG_RX_DV_GATE_REG_0_ADDR 0x05fc extern const struct xgene_mac_ops xgene_sgmac_ops; extern const struct xgene_port_ops xgene_sgport_ops; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 9d4e8e113fe1..c39a7f5c6a01 100644 --- 
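The xgene_enet changes above move the packet, byte and error counters from the single pdata->stats structure into each descriptor ring, so each ring updates only its own counters and ndo_get_stats64 sums them at read time. A small stand-alone sketch of that aggregation step; the structure and field names are illustrative, not the driver's:

    struct ring_counters {
        unsigned long long packets, bytes, errors, dropped;
    };

    /* Sum per-queue counters into one device-wide snapshot, the way the
     * reworked xgene_enet_get_stats64() walks its tx and rx rings. */
    static void sum_rings(const struct ring_counters *rings, int nr_rings,
                          struct ring_counters *total)
    {
        int i;

        total->packets = total->bytes = 0;
        total->errors = total->dropped = 0;
        for (i = 0; i < nr_rings; i++) {
            total->packets += rings[i].packets;
            total->bytes   += rings[i].bytes;
            total->errors  += rings[i].errors;
            total->dropped += rings[i].dropped;
        }
    }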
a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -813,6 +813,46 @@ static inline struct sk_buff *bnxt_copy_skb(struct bnxt_napi *bnapi, u8 *data, return skb; } +static int bnxt_discard_rx(struct bnxt *bp, struct bnxt_napi *bnapi, + u32 *raw_cons, void *cmp) +{ + struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring; + struct rx_cmp *rxcmp = cmp; + u32 tmp_raw_cons = *raw_cons; + u8 cmp_type, agg_bufs = 0; + + cmp_type = RX_CMP_TYPE(rxcmp); + + if (cmp_type == CMP_TYPE_RX_L2_CMP) { + agg_bufs = (le32_to_cpu(rxcmp->rx_cmp_misc_v1) & + RX_CMP_AGG_BUFS) >> + RX_CMP_AGG_BUFS_SHIFT; + } else if (cmp_type == CMP_TYPE_RX_L2_TPA_END_CMP) { + struct rx_tpa_end_cmp *tpa_end = cmp; + + agg_bufs = (le32_to_cpu(tpa_end->rx_tpa_end_cmp_misc_v1) & + RX_TPA_END_CMP_AGG_BUFS) >> + RX_TPA_END_CMP_AGG_BUFS_SHIFT; + } + + if (agg_bufs) { + if (!bnxt_agg_bufs_valid(bp, cpr, agg_bufs, &tmp_raw_cons)) + return -EBUSY; + } + *raw_cons = tmp_raw_cons; + return 0; +} + +static void bnxt_sched_reset(struct bnxt *bp, struct bnxt_rx_ring_info *rxr) +{ + if (!rxr->bnapi->in_reset) { + rxr->bnapi->in_reset = true; + set_bit(BNXT_RESET_TASK_SP_EVENT, &bp->sp_event); + schedule_work(&bp->sp_task); + } + rxr->rx_next_cons = 0xffff; +} + static void bnxt_tpa_start(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, struct rx_tpa_start_cmp *tpa_start, struct rx_tpa_start_cmp_ext *tpa_start1) @@ -830,6 +870,11 @@ static void bnxt_tpa_start(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, prod_rx_buf = &rxr->rx_buf_ring[prod]; tpa_info = &rxr->rx_tpa[agg_id]; + if (unlikely(cons != rxr->rx_next_cons)) { + bnxt_sched_reset(bp, rxr); + return; + } + prod_rx_buf->data = tpa_info->data; mapping = tpa_info->mapping; @@ -867,6 +912,7 @@ static void bnxt_tpa_start(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, rxr->rx_prod = NEXT_RX(prod); cons = NEXT_RX(cons); + rxr->rx_next_cons = NEXT_RX(cons); cons_rx_buf = &rxr->rx_buf_ring[cons]; bnxt_reuse_rx_data(rxr, cons, cons_rx_buf->data); @@ -980,6 +1026,14 @@ static inline struct sk_buff *bnxt_tpa_end(struct bnxt *bp, dma_addr_t mapping; struct sk_buff *skb; + if (unlikely(bnapi->in_reset)) { + int rc = bnxt_discard_rx(bp, bnapi, raw_cons, tpa_end); + + if (rc < 0) + return ERR_PTR(-EBUSY); + return NULL; + } + tpa_info = &rxr->rx_tpa[agg_id]; data = tpa_info->data; prefetch(data); @@ -1146,6 +1200,12 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_napi *bnapi, u32 *raw_cons, cons = rxcmp->rx_cmp_opaque; rx_buf = &rxr->rx_buf_ring[cons]; data = rx_buf->data; + if (unlikely(cons != rxr->rx_next_cons)) { + int rc1 = bnxt_discard_rx(bp, bnapi, raw_cons, rxcmp); + + bnxt_sched_reset(bp, rxr); + return rc1; + } prefetch(data); agg_bufs = (le32_to_cpu(rxcmp->rx_cmp_misc_v1) & RX_CMP_AGG_BUFS) >> @@ -1245,6 +1305,7 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_napi *bnapi, u32 *raw_cons, next_rx: rxr->rx_prod = NEXT_RX(prod); + rxr->rx_next_cons = NEXT_RX(cons); next_rx_no_prod: *raw_cons = tmp_raw_cons; @@ -2486,6 +2547,7 @@ static void bnxt_clear_ring_indices(struct bnxt *bp) rxr->rx_prod = 0; rxr->rx_agg_prod = 0; rxr->rx_sw_agg_prod = 0; + rxr->rx_next_cons = 0; } } } @@ -4462,6 +4524,7 @@ static void bnxt_enable_napi(struct bnxt *bp) int i; for (i = 0; i < bp->cp_nr_rings; i++) { + bp->bnapi[i]->in_reset = false; bnxt_enable_poll(bp->bnapi[i]); napi_enable(&bp->bnapi[i]->napi); } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 8b823ff558ff..de9d53eee3dd 100644 --- 
a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -584,6 +584,7 @@ struct bnxt_rx_ring_info { u16 rx_prod; u16 rx_agg_prod; u16 rx_sw_agg_prod; + u16 rx_next_cons; void __iomem *rx_doorbell; void __iomem *rx_agg_doorbell; @@ -636,6 +637,7 @@ struct bnxt_napi { #ifdef CONFIG_NET_RX_BUSY_POLL atomic_t poll_state; #endif + bool in_reset; }; #ifdef CONFIG_NET_RX_BUSY_POLL diff --git a/drivers/net/ethernet/marvell/Kconfig b/drivers/net/ethernet/marvell/Kconfig index b5c6d42daa12..2664827ddecd 100644 --- a/drivers/net/ethernet/marvell/Kconfig +++ b/drivers/net/ethernet/marvell/Kconfig @@ -68,7 +68,7 @@ config MVNETA config MVNETA_BM tristate - default y if MVNETA=y && MVNETA_BM_ENABLE + default y if MVNETA=y && MVNETA_BM_ENABLE!=n default MVNETA_BM_ENABLE select HWBM help diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_minidump.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_minidump.c index cda9e604a95f..0844b7c75767 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_minidump.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_minidump.c @@ -1417,6 +1417,7 @@ void qlcnic_83xx_get_minidump_template(struct qlcnic_adapter *adapter) struct qlcnic_fw_dump *fw_dump = &ahw->fw_dump; struct pci_dev *pdev = adapter->pdev; bool extended = false; + int ret; prev_version = adapter->fw_version; current_version = qlcnic_83xx_get_fw_version(adapter); @@ -1427,8 +1428,11 @@ void qlcnic_83xx_get_minidump_template(struct qlcnic_adapter *adapter) if (qlcnic_83xx_md_check_extended_dump_capability(adapter)) extended = !qlcnic_83xx_extend_md_capab(adapter); - if (!qlcnic_fw_cmd_get_minidump_temp(adapter)) - dev_info(&pdev->dev, "Supports FW dump capability\n"); + ret = qlcnic_fw_cmd_get_minidump_temp(adapter); + if (ret) + return; + + dev_info(&pdev->dev, "Supports FW dump capability\n"); /* Once we have minidump template with extended iSCSI dump * capability, update the minidump capture mask to 0x1f as diff --git a/drivers/net/hamradio/baycom_epp.c b/drivers/net/hamradio/baycom_epp.c index 72c9f1f352b4..7c7830722ea2 100644 --- a/drivers/net/hamradio/baycom_epp.c +++ b/drivers/net/hamradio/baycom_epp.c @@ -635,10 +635,10 @@ static int receive(struct net_device *dev, int cnt) #ifdef __i386__ #include <asm/msr.h> -#define GETTICK(x) \ -({ \ - if (cpu_has_tsc) \ - x = (unsigned int)rdtsc(); \ +#define GETTICK(x) \ +({ \ + if (boot_cpu_has(X86_FEATURE_TSC)) \ + x = (unsigned int)rdtsc(); \ }) #else /* __i386__ */ #define GETTICK(x) diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index b42f26029225..4412a57ec862 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -711,6 +711,7 @@ static void xenvif_tx_err(struct xenvif_queue *queue, if (cons == end) break; RING_COPY_REQUEST(&queue->tx, cons++, txp); + extra_count = 0; /* only the first frag can have extras */ } while (1); queue->tx.req_cons = cons; } diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index f70090897fdf..f2d01d4d9364 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -847,6 +847,14 @@ static int cpu_pmu_init(struct arm_pmu *cpu_pmu) if (!platform_get_irq(cpu_pmu->plat_device, 0)) cpu_pmu->pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; + /* + * This is a CPU PMU potentially in a heterogeneous configuration (e.g. + * big.LITTLE). This is not an uncore PMU, and we have taken ctx + * sharing into account (e.g. with our pmu::filter_match callback and + * pmu::event_init group validation). 
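The bnxt changes above (bnxt_sched_reset(), bnxt_discard_rx() and the new rx_next_cons / bnapi->in_reset fields) detect an out-of-sequence RX consumer index, schedule a single recovery task, and discard further completions until NAPI is re-enabled. A compact model of that one-shot recovery pattern; names are illustrative and index wrapping is simplified:

    struct rx_ring_state {
        int in_reset;               /* only the first detection queues work */
        unsigned short next_cons;   /* consumer index the driver expects */
    };

    static void sched_reset(struct rx_ring_state *r, void (*queue_recovery)(void))
    {
        if (!r->in_reset) {
            r->in_reset = 1;
            queue_recovery();       /* stand-in for schedule_work(&bp->sp_task) */
        }
        r->next_cons = 0xffff;      /* impossible value, forces a later resync */
    }

    static int handle_completion(struct rx_ring_state *r, unsigned short cons,
                                 void (*queue_recovery)(void))
    {
        if (cons != r->next_cons) { /* hardware and driver disagree: recover */
            sched_reset(r, queue_recovery);
            return -1;
        }
        r->next_cons = (unsigned short)(cons + 1);
        return 0;
    }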
+ */ + cpu_pmu->pmu.capabilities |= PERF_PMU_CAP_HETEROGENEOUS_CPUS; + return 0; out_unregister: diff --git a/drivers/pinctrl/pinctrl-at91-pio4.c b/drivers/pinctrl/pinctrl-at91-pio4.c index 4429312e848d..2c447130b954 100644 --- a/drivers/pinctrl/pinctrl-at91-pio4.c +++ b/drivers/pinctrl/pinctrl-at91-pio4.c @@ -722,9 +722,11 @@ static int atmel_conf_pin_config_group_set(struct pinctrl_dev *pctldev, break; case PIN_CONFIG_BIAS_PULL_UP: conf |= ATMEL_PIO_PUEN_MASK; + conf &= (~ATMEL_PIO_PDEN_MASK); break; case PIN_CONFIG_BIAS_PULL_DOWN: conf |= ATMEL_PIO_PDEN_MASK; + conf &= (~ATMEL_PIO_PUEN_MASK); break; case PIN_CONFIG_DRIVE_OPEN_DRAIN: if (arg == 0) diff --git a/drivers/pnp/pnpbios/core.c b/drivers/pnp/pnpbios/core.c index facd43b8516c..81603d99082b 100644 --- a/drivers/pnp/pnpbios/core.c +++ b/drivers/pnp/pnpbios/core.c @@ -521,10 +521,11 @@ static int __init pnpbios_init(void) int ret; if (pnpbios_disabled || dmi_check_system(pnpbios_dmi_table) || - paravirt_enabled()) { + arch_pnpbios_disabled()) { printk(KERN_INFO "PnPBIOS: Disabled\n"); return -ENODEV; } + #ifdef CONFIG_PNPACPI if (!acpi_disabled && !pnpacpi_disabled) { pnpbios_disabled = 1; diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c index 8fad0a7044d3..f2201d42a9cd 100644 --- a/drivers/powercap/intel_rapl.c +++ b/drivers/powercap/intel_rapl.c @@ -34,6 +34,9 @@ #include <asm/processor.h> #include <asm/cpu_device_id.h> +/* Local defines */ +#define MSR_PLATFORM_POWER_LIMIT 0x0000065C + /* bitmasks for RAPL MSRs, used by primitive access functions */ #define ENERGY_STATUS_MASK 0xffffffff @@ -86,6 +89,7 @@ enum rapl_domain_type { RAPL_DOMAIN_PP0, /* core power plane */ RAPL_DOMAIN_PP1, /* graphics uncore */ RAPL_DOMAIN_DRAM,/* DRAM control_type */ + RAPL_DOMAIN_PLATFORM, /* PSys control_type */ RAPL_DOMAIN_MAX, }; @@ -251,9 +255,11 @@ static const char * const rapl_domain_names[] = { "core", "uncore", "dram", + "psys", }; static struct powercap_control_type *control_type; /* PowerCap Controller */ +static struct rapl_domain *platform_rapl_domain; /* Platform (PSys) domain */ /* caller to ensure CPU hotplug lock is held */ static struct rapl_package *find_package_by_id(int id) @@ -409,6 +415,14 @@ static const struct powercap_zone_ops zone_ops[] = { .set_enable = set_domain_enable, .get_enable = get_domain_enable, }, + /* RAPL_DOMAIN_PLATFORM */ + { + .get_energy_uj = get_energy_counter, + .get_max_energy_range_uj = get_max_energy_counter, + .release = release_zone, + .set_enable = set_domain_enable, + .get_enable = get_domain_enable, + }, }; static int set_power_limit(struct powercap_zone *power_zone, int id, @@ -1160,6 +1174,13 @@ static int rapl_unregister_powercap(void) powercap_unregister_zone(control_type, &rd_package->power_zone); } + + if (platform_rapl_domain) { + powercap_unregister_zone(control_type, + &platform_rapl_domain->power_zone); + kfree(platform_rapl_domain); + } + powercap_unregister_control_type(control_type); return 0; @@ -1239,6 +1260,47 @@ err_cleanup: return ret; } +static int rapl_register_psys(void) +{ + struct rapl_domain *rd; + struct powercap_zone *power_zone; + u64 val; + + if (rdmsrl_safe_on_cpu(0, MSR_PLATFORM_ENERGY_STATUS, &val) || !val) + return -ENODEV; + + if (rdmsrl_safe_on_cpu(0, MSR_PLATFORM_POWER_LIMIT, &val) || !val) + return -ENODEV; + + rd = kzalloc(sizeof(*rd), GFP_KERNEL); + if (!rd) + return -ENOMEM; + + rd->name = rapl_domain_names[RAPL_DOMAIN_PLATFORM]; + rd->id = RAPL_DOMAIN_PLATFORM; + rd->msrs[0] = MSR_PLATFORM_POWER_LIMIT; + rd->msrs[1] = 
MSR_PLATFORM_ENERGY_STATUS; + rd->rpl[0].prim_id = PL1_ENABLE; + rd->rpl[0].name = pl1_name; + rd->rpl[1].prim_id = PL2_ENABLE; + rd->rpl[1].name = pl2_name; + rd->rp = find_package_by_id(0); + + power_zone = powercap_register_zone(&rd->power_zone, control_type, + "psys", NULL, + &zone_ops[RAPL_DOMAIN_PLATFORM], + 2, &constraint_ops); + + if (IS_ERR(power_zone)) { + kfree(rd); + return PTR_ERR(power_zone); + } + + platform_rapl_domain = rd; + + return 0; +} + static int rapl_register_powercap(void) { struct rapl_domain *rd; @@ -1255,6 +1317,10 @@ static int rapl_register_powercap(void) list_for_each_entry(rp, &rapl_packages, plist) if (rapl_package_register_powercap(rp)) goto err_cleanup_package; + + /* Don't bail out if PSys is not supported */ + rapl_register_psys(); + return ret; err_cleanup_package: @@ -1289,6 +1355,9 @@ static int rapl_check_domain(int cpu, int domain) case RAPL_DOMAIN_DRAM: msr = MSR_DRAM_ENERGY_STATUS; break; + case RAPL_DOMAIN_PLATFORM: + /* PSYS(PLATFORM) is not a CPU domain, so avoid printng error */ + return -EINVAL; default: pr_err("invalid domain id %d\n", domain); return -EINVAL; diff --git a/drivers/regulator/axp20x-regulator.c b/drivers/regulator/axp20x-regulator.c index 40cd894e4df5..514a5e8fdbab 100644 --- a/drivers/regulator/axp20x-regulator.c +++ b/drivers/regulator/axp20x-regulator.c @@ -157,7 +157,9 @@ static struct regulator_ops axp20x_ops_sw = { static const struct regulator_linear_range axp20x_ldo4_ranges[] = { REGULATOR_LINEAR_RANGE(1250000, 0x0, 0x0, 0), REGULATOR_LINEAR_RANGE(1300000, 0x1, 0x8, 100000), - REGULATOR_LINEAR_RANGE(2500000, 0x9, 0xf, 100000), + REGULATOR_LINEAR_RANGE(2500000, 0x9, 0x9, 0), + REGULATOR_LINEAR_RANGE(2700000, 0xa, 0xb, 100000), + REGULATOR_LINEAR_RANGE(3000000, 0xc, 0xf, 100000), }; static const struct regulator_desc axp20x_regulators[] = { @@ -215,10 +217,14 @@ static const struct regulator_desc axp22x_regulators[] = { AXP22X_ELDO2_V_OUT, 0x1f, AXP22X_PWR_OUT_CTRL2, BIT(1)), AXP_DESC(AXP22X, ELDO3, "eldo3", "eldoin", 700, 3300, 100, AXP22X_ELDO3_V_OUT, 0x1f, AXP22X_PWR_OUT_CTRL2, BIT(2)), - AXP_DESC_IO(AXP22X, LDO_IO0, "ldo_io0", "ips", 1800, 3300, 100, + /* Note the datasheet only guarantees reliable operation up to + * 3.3V, this needs to be enforced via dts provided constraints */ + AXP_DESC_IO(AXP22X, LDO_IO0, "ldo_io0", "ips", 700, 3800, 100, AXP22X_LDO_IO0_V_OUT, 0x1f, AXP20X_GPIO0_CTRL, 0x07, AXP22X_IO_ENABLED, AXP22X_IO_DISABLED), - AXP_DESC_IO(AXP22X, LDO_IO1, "ldo_io1", "ips", 1800, 3300, 100, + /* Note the datasheet only guarantees reliable operation up to + * 3.3V, this needs to be enforced via dts provided constraints */ + AXP_DESC_IO(AXP22X, LDO_IO1, "ldo_io1", "ips", 700, 3800, 100, AXP22X_LDO_IO1_V_OUT, 0x1f, AXP20X_GPIO1_CTRL, 0x07, AXP22X_IO_ENABLED, AXP22X_IO_DISABLED), AXP_DESC_FIXED(AXP22X, RTC_LDO, "rtc_ldo", "ips", 3000), diff --git a/drivers/regulator/da9063-regulator.c b/drivers/regulator/da9063-regulator.c index ed9e7e96f877..c6af343f54ea 100644 --- a/drivers/regulator/da9063-regulator.c +++ b/drivers/regulator/da9063-regulator.c @@ -900,4 +900,4 @@ module_exit(da9063_regulator_cleanup); MODULE_AUTHOR("Krystian Garbaciak <krystian.garbaciak@diasemi.com>"); MODULE_DESCRIPTION("DA9063 regulators driver"); MODULE_LICENSE("GPL"); -MODULE_ALIAS("paltform:" DA9063_DRVNAME_REGULATORS); +MODULE_ALIAS("platform:" DA9063_DRVNAME_REGULATORS); diff --git a/drivers/regulator/gpio-regulator.c b/drivers/regulator/gpio-regulator.c index a8718e98674a..83e89e5d4752 100644 --- 
a/drivers/regulator/gpio-regulator.c +++ b/drivers/regulator/gpio-regulator.c @@ -162,6 +162,8 @@ of_get_gpio_regulator_config(struct device *dev, struct device_node *np, of_property_read_u32(np, "startup-delay-us", &config->startup_delay); config->enable_gpio = of_get_named_gpio(np, "enable-gpio", 0); + if (config->enable_gpio == -EPROBE_DEFER) + return ERR_PTR(-EPROBE_DEFER); /* Fetch GPIOs. - optional property*/ ret = of_gpio_count(np); diff --git a/drivers/regulator/s2mps11.c b/drivers/regulator/s2mps11.c index d24e2c783dc5..6dfa3502e1f1 100644 --- a/drivers/regulator/s2mps11.c +++ b/drivers/regulator/s2mps11.c @@ -308,7 +308,7 @@ static struct regulator_ops s2mps11_buck_ops = { .enable_mask = S2MPS11_ENABLE_MASK \ } -#define regulator_desc_s2mps11_buck6_10(num, min, step) { \ +#define regulator_desc_s2mps11_buck67810(num, min, step) { \ .name = "BUCK"#num, \ .id = S2MPS11_BUCK##num, \ .ops = &s2mps11_buck_ops, \ @@ -324,6 +324,22 @@ static struct regulator_ops s2mps11_buck_ops = { .enable_mask = S2MPS11_ENABLE_MASK \ } +#define regulator_desc_s2mps11_buck9 { \ + .name = "BUCK9", \ + .id = S2MPS11_BUCK9, \ + .ops = &s2mps11_buck_ops, \ + .type = REGULATOR_VOLTAGE, \ + .owner = THIS_MODULE, \ + .min_uV = MIN_3000_MV, \ + .uV_step = STEP_25_MV, \ + .n_voltages = S2MPS11_BUCK9_N_VOLTAGES, \ + .ramp_delay = S2MPS11_RAMP_DELAY, \ + .vsel_reg = S2MPS11_REG_B9CTRL2, \ + .vsel_mask = S2MPS11_BUCK9_VSEL_MASK, \ + .enable_reg = S2MPS11_REG_B9CTRL1, \ + .enable_mask = S2MPS11_ENABLE_MASK \ +} + static const struct regulator_desc s2mps11_regulators[] = { regulator_desc_s2mps11_ldo(1, STEP_25_MV), regulator_desc_s2mps11_ldo(2, STEP_50_MV), @@ -368,11 +384,11 @@ static const struct regulator_desc s2mps11_regulators[] = { regulator_desc_s2mps11_buck1_4(3), regulator_desc_s2mps11_buck1_4(4), regulator_desc_s2mps11_buck5, - regulator_desc_s2mps11_buck6_10(6, MIN_600_MV, STEP_6_25_MV), - regulator_desc_s2mps11_buck6_10(7, MIN_600_MV, STEP_6_25_MV), - regulator_desc_s2mps11_buck6_10(8, MIN_600_MV, STEP_6_25_MV), - regulator_desc_s2mps11_buck6_10(9, MIN_3000_MV, STEP_25_MV), - regulator_desc_s2mps11_buck6_10(10, MIN_750_MV, STEP_12_5_MV), + regulator_desc_s2mps11_buck67810(6, MIN_600_MV, STEP_6_25_MV), + regulator_desc_s2mps11_buck67810(7, MIN_600_MV, STEP_6_25_MV), + regulator_desc_s2mps11_buck67810(8, MIN_600_MV, STEP_6_25_MV), + regulator_desc_s2mps11_buck9, + regulator_desc_s2mps11_buck67810(10, MIN_750_MV, STEP_12_5_MV), }; static struct regulator_ops s2mps14_reg_ops; diff --git a/drivers/staging/unisys/visorbus/visorchipset.c b/drivers/staging/unisys/visorbus/visorchipset.c index 5fbda7b218c7..9cf4f8463c4e 100644 --- a/drivers/staging/unisys/visorbus/visorchipset.c +++ b/drivers/staging/unisys/visorbus/visorchipset.c @@ -2425,7 +2425,7 @@ static __init uint32_t visorutil_spar_detect(void) { unsigned int eax, ebx, ecx, edx; - if (cpu_has_hypervisor) { + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { /* check the ID */ cpuid(UNISYS_SPAR_LEAF_ID, &eax, &ebx, &ecx, &edx); return (ebx == UNISYS_SPAR_ID_EBX) && diff --git a/drivers/video/fbdev/Kconfig b/drivers/video/fbdev/Kconfig index 983280e8d93f..e5a391aecde1 100644 --- a/drivers/video/fbdev/Kconfig +++ b/drivers/video/fbdev/Kconfig @@ -761,7 +761,7 @@ config FB_VESA config FB_EFI bool "EFI-based Framebuffer Support" - depends on (FB = y) && X86 && EFI + depends on (FB = y) && !IA64 && EFI select FB_CFB_FILLRECT select FB_CFB_COPYAREA select FB_CFB_IMAGEBLIT diff --git a/drivers/video/fbdev/efifb.c b/drivers/video/fbdev/efifb.c index 
95d293b7445a..f4c045c0051c 100644 --- a/drivers/video/fbdev/efifb.c +++ b/drivers/video/fbdev/efifb.c @@ -6,16 +6,14 @@ * */ -#include <linux/module.h> #include <linux/kernel.h> +#include <linux/efi.h> #include <linux/errno.h> #include <linux/fb.h> #include <linux/platform_device.h> #include <linux/screen_info.h> -#include <linux/dmi.h> -#include <linux/pci.h> #include <video/vga.h> -#include <asm/sysfb.h> +#include <asm/efi.h> static bool request_mem_succeeded = false; @@ -85,21 +83,13 @@ static struct fb_ops efifb_ops = { static int efifb_setup(char *options) { char *this_opt; - int i; if (options && *options) { while ((this_opt = strsep(&options, ",")) != NULL) { if (!*this_opt) continue; - for (i = 0; i < M_UNKNOWN; i++) { - if (efifb_dmi_list[i].base != 0 && - !strcmp(this_opt, efifb_dmi_list[i].optname)) { - screen_info.lfb_base = efifb_dmi_list[i].base; - screen_info.lfb_linelength = efifb_dmi_list[i].stride; - screen_info.lfb_width = efifb_dmi_list[i].width; - screen_info.lfb_height = efifb_dmi_list[i].height; - } - } + efifb_setup_from_dmi(&screen_info, this_opt); + if (!strncmp(this_opt, "base:", 5)) screen_info.lfb_base = simple_strtoul(this_opt+5, NULL, 0); else if (!strncmp(this_opt, "stride:", 7)) @@ -338,5 +328,4 @@ static struct platform_driver efifb_driver = { .remove = efifb_remove, }; -module_platform_driver(efifb_driver); -MODULE_LICENSE("GPL"); +builtin_platform_driver(efifb_driver); diff --git a/drivers/xen/efi.c b/drivers/xen/efi.c index be7e56a338e8..e9d2135445c1 100644 --- a/drivers/xen/efi.c +++ b/drivers/xen/efi.c @@ -316,7 +316,6 @@ static const struct efi efi_xen __initconst = { .get_next_high_mono_count = xen_efi_get_next_high_mono_count, .reset_system = NULL, /* Functionality provided by Xen. */ .set_virtual_address_map = NULL, /* Not used under Xen. */ - .memmap = NULL, /* Not used under Xen. */ .flags = 0 /* Initialized later. */ }; diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index feef8a9c4de7..f02404052b7b 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -112,7 +112,6 @@ static int ecryptfs_readdir(struct file *file, struct dir_context *ctx) .sb = inode->i_sb, }; lower_file = ecryptfs_file_to_lower(file); - lower_file->f_pos = ctx->pos; rc = iterate_dir(lower_file, &buf.ctx); ctx->pos = buf.ctx.pos; if (rc < 0) @@ -223,14 +222,6 @@ static int ecryptfs_open(struct inode *inode, struct file *file) } ecryptfs_set_file_lower( file, ecryptfs_inode_to_private(inode)->lower_file); - if (d_is_dir(ecryptfs_dentry)) { - ecryptfs_printk(KERN_DEBUG, "This is a directory\n"); - mutex_lock(&crypt_stat->cs_mutex); - crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); - mutex_unlock(&crypt_stat->cs_mutex); - rc = 0; - goto out; - } rc = read_or_initialize_metadata(ecryptfs_dentry); if (rc) goto out_put; @@ -247,6 +238,45 @@ out: return rc; } +/** + * ecryptfs_dir_open + * @inode: inode speciying file to open + * @file: Structure to return filled in + * + * Opens the file specified by inode. 
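efifb_setup() in the hunk above now hands DMI quirks off to efifb_setup_from_dmi() and keeps only the plain "opt:value" command-line overrides, parsed with strsep() plus prefix matches. A userspace sketch of that option walk, using only option names visible in the hunk:

    #define _DEFAULT_SOURCE         /* for strsep() on glibc */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static void parse_video_options(char *options,
                                    unsigned long *base, unsigned long *stride)
    {
        char *opt;

        while ((opt = strsep(&options, ",")) != NULL) {
            if (!*opt)
                continue;                   /* skip empty ",," entries */
            if (!strncmp(opt, "base:", 5))
                *base = strtoul(opt + 5, NULL, 0);
            else if (!strncmp(opt, "stride:", 7))
                *stride = strtoul(opt + 7, NULL, 0);
        }
    }

    int main(void)
    {
        char args[] = "base:0x80000000,stride:4096";
        unsigned long base = 0, stride = 0;

        parse_video_options(args, &base, &stride);
        printf("base=%#lx stride=%lu\n", base, stride);
        return 0;
    }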
+ * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_dir_open(struct inode *inode, struct file *file) +{ + struct dentry *ecryptfs_dentry = file->f_path.dentry; + /* Private value of ecryptfs_dentry allocated in + * ecryptfs_lookup() */ + struct ecryptfs_file_info *file_info; + struct file *lower_file; + + /* Released in ecryptfs_release or end of function if failure */ + file_info = kmem_cache_zalloc(ecryptfs_file_info_cache, GFP_KERNEL); + ecryptfs_set_file_private(file, file_info); + if (unlikely(!file_info)) { + ecryptfs_printk(KERN_ERR, + "Error attempting to allocate memory\n"); + return -ENOMEM; + } + lower_file = dentry_open(ecryptfs_dentry_to_lower_path(ecryptfs_dentry), + file->f_flags, current_cred()); + if (IS_ERR(lower_file)) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the lower file for the dentry with name " + "[%pd]; rc = [%ld]\n", __func__, + ecryptfs_dentry, PTR_ERR(lower_file)); + kmem_cache_free(ecryptfs_file_info_cache, file_info); + return PTR_ERR(lower_file); + } + ecryptfs_set_file_lower(file, lower_file); + return 0; +} + static int ecryptfs_flush(struct file *file, fl_owner_t td) { struct file *lower_file = ecryptfs_file_to_lower(file); @@ -267,6 +297,19 @@ static int ecryptfs_release(struct inode *inode, struct file *file) return 0; } +static int ecryptfs_dir_release(struct inode *inode, struct file *file) +{ + fput(ecryptfs_file_to_lower(file)); + kmem_cache_free(ecryptfs_file_info_cache, + ecryptfs_file_to_private(file)); + return 0; +} + +static loff_t ecryptfs_dir_llseek(struct file *file, loff_t offset, int whence) +{ + return vfs_llseek(ecryptfs_file_to_lower(file), offset, whence); +} + static int ecryptfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { @@ -346,20 +389,16 @@ const struct file_operations ecryptfs_dir_fops = { #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, #endif - .open = ecryptfs_open, - .flush = ecryptfs_flush, - .release = ecryptfs_release, + .open = ecryptfs_dir_open, + .release = ecryptfs_dir_release, .fsync = ecryptfs_fsync, - .fasync = ecryptfs_fasync, - .splice_read = generic_file_splice_read, - .llseek = default_llseek, + .llseek = ecryptfs_dir_llseek, }; const struct file_operations ecryptfs_main_fops = { .llseek = generic_file_llseek, .read_iter = ecryptfs_read_update_atime, .write_iter = generic_file_write_iter, - .iterate = ecryptfs_readdir, .unlocked_ioctl = ecryptfs_unlocked_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c index d48e0d261d78..5f22e74bbade 100644 --- a/fs/efivarfs/file.c +++ b/fs/efivarfs/file.c @@ -157,7 +157,7 @@ efivarfs_ioc_setxflags(struct file *file, void __user *arg) return 0; } -long +static long efivarfs_file_ioctl(struct file *file, unsigned int cmd, unsigned long p) { void __user *arg = (void __user *)p; diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 553c5d2db4a4..9cb54a38832d 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -216,8 +216,7 @@ static int efivarfs_fill_super(struct super_block *sb, void *data, int silent) INIT_LIST_HEAD(&efivarfs_list); - err = efivar_init(efivarfs_callback, (void *)sb, false, - true, &efivarfs_list); + err = efivar_init(efivarfs_callback, (void *)sb, true, &efivarfs_list); if (err) __efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL, NULL); diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index 5384ceb35b1c..98b3eb7d8eaf 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c @@ -203,6 +203,8 @@ int 
get_rock_ridge_filename(struct iso_directory_record *de, int retnamlen = 0; int truncate = 0; int ret = 0; + char *p; + int len; if (!ISOFS_SB(inode->i_sb)->s_rock) return 0; @@ -267,12 +269,17 @@ repeat: rr->u.NM.flags); break; } - if ((strlen(retname) + rr->len - 5) >= 254) { + len = rr->len - 5; + if (retnamlen + len >= 254) { truncate = 1; break; } - strncat(retname, rr->u.NM.name, rr->len - 5); - retnamlen += rr->len - 5; + p = memchr(rr->u.NM.name, '\0', len); + if (unlikely(p)) + len = p - rr->u.NM.name; + memcpy(retname + retnamlen, rr->u.NM.name, len); + retnamlen += len; + retname[retnamlen] = '\0'; break; case SIG('R', 'E'): kfree(rs.buffer); diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 03b688d19f69..37f9678ae4df 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -153,9 +153,9 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, p = buf + len + nlen; *p = '\0'; for (kn = kn_to; kn != common; kn = kn->parent) { - nlen = strlen(kn->name); - p -= nlen; - memcpy(p, kn->name, nlen); + size_t tmp = strlen(kn->name); + p -= tmp; + memcpy(p, kn->name, tmp); *(--p) = '/'; } diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index f73541fbe7af..3d670a3678f2 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -15,6 +15,7 @@ #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/namei.h> +#include <linux/seq_file.h> #include "kernfs-internal.h" @@ -40,6 +41,19 @@ static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry) return 0; } +static int kernfs_sop_show_path(struct seq_file *sf, struct dentry *dentry) +{ + struct kernfs_node *node = dentry->d_fsdata; + struct kernfs_root *root = kernfs_root(node); + struct kernfs_syscall_ops *scops = root->syscall_ops; + + if (scops && scops->show_path) + return scops->show_path(sf, node, root); + + seq_dentry(sf, dentry, " \t\n\\"); + return 0; +} + const struct super_operations kernfs_sops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, @@ -47,6 +61,7 @@ const struct super_operations kernfs_sops = { .remount_fs = kernfs_sop_remount_fs, .show_options = kernfs_sop_show_options, + .show_path = kernfs_sop_show_path, }; /** diff --git a/fs/namei.c b/fs/namei.c index 1d9ca2d5dff6..42f8ca038254 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1794,30 +1794,49 @@ static inline unsigned int fold_hash(unsigned long hash) return hash_64(hash, 32); } +/* + * This is George Marsaglia's XORSHIFT generator. + * It implements a maximum-period LFSR in only a few + * instructions. It also has the property (required + * by hash_name()) that mix_hash(0) = 0. 
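The namei.c comment above states the two properties hash_name() depends on: the xorshift step has a long period over non-zero states and it maps zero to zero. A stand-alone check of the zero fixed point, using the 64-bit shift constants from the hunk that follows:

    #include <stdio.h>

    static unsigned long long mix_hash(unsigned long long hash)
    {
        hash ^= hash << 13;
        hash ^= hash >> 7;
        hash ^= hash << 17;
        return hash;
    }

    int main(void)
    {
        printf("mix_hash(0) = %llu\n", mix_hash(0));    /* prints 0 */
        printf("mix_hash(1) = %#llx\n", mix_hash(1));   /* non-zero */
        return 0;
    }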
+ */ +static inline unsigned long mix_hash(unsigned long hash) +{ + hash ^= hash << 13; + hash ^= hash >> 7; + hash ^= hash << 17; + return hash; +} + #else /* 32-bit case */ #define fold_hash(x) (x) +static inline unsigned long mix_hash(unsigned long hash) +{ + hash ^= hash << 13; + hash ^= hash >> 17; + hash ^= hash << 5; + return hash; +} + #endif unsigned int full_name_hash(const unsigned char *name, unsigned int len) { - unsigned long a, mask; - unsigned long hash = 0; + unsigned long a, hash = 0; for (;;) { a = load_unaligned_zeropad(name); if (len < sizeof(unsigned long)) break; - hash += a; - hash *= 9; + hash = mix_hash(hash + a); name += sizeof(unsigned long); len -= sizeof(unsigned long); if (!len) goto done; } - mask = bytemask_from_count(len); - hash += mask & a; + hash += a & bytemask_from_count(len); done: return fold_hash(hash); } @@ -1835,7 +1854,7 @@ static inline u64 hash_name(const char *name) hash = a = 0; len = -sizeof(unsigned long); do { - hash = (hash + a) * 9; + hash = mix_hash(hash + a); len += sizeof(unsigned long); a = load_unaligned_zeropad(name+len); b = a ^ REPEAT_BYTE('/'); @@ -2267,6 +2286,33 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, EXPORT_SYMBOL(vfs_path_lookup); /** + * lookup_hash - lookup single pathname component on already hashed name + * @name: name and hash to lookup + * @base: base directory to lookup from + * + * The name must have been verified and hashed (see lookup_one_len()). Using + * this after just full_name_hash() is unsafe. + * + * This function also doesn't check for search permission on base directory. + * + * Use lookup_one_len_unlocked() instead, unless you really know what you are + * doing. + * + * Do not hold i_mutex; this helper takes i_mutex if necessary. + */ +struct dentry *lookup_hash(const struct qstr *name, struct dentry *base) +{ + struct dentry *ret; + + ret = lookup_dcache(name, base, 0); + if (!ret) + ret = lookup_slow(name, base, 0); + + return ret; +} +EXPORT_SYMBOL(lookup_hash); + +/** * lookup_one_len - filesystem helper to lookup single pathname component * @name: pathname component to lookup * @base: base directory to lookup from @@ -2337,7 +2383,6 @@ struct dentry *lookup_one_len_unlocked(const char *name, struct qstr this; unsigned int c; int err; - struct dentry *ret; this.name = name; this.len = len; @@ -2369,10 +2414,7 @@ struct dentry *lookup_one_len_unlocked(const char *name, if (err) return ERR_PTR(err); - ret = lookup_dcache(&this, base, 0); - if (!ret) - ret = lookup_slow(&this, base, 0); - return ret; + return lookup_hash(&this, base); } EXPORT_SYMBOL(lookup_one_len_unlocked); @@ -2942,22 +2984,10 @@ no_open: dentry = lookup_real(dir, dentry, nd->flags); if (IS_ERR(dentry)) return PTR_ERR(dentry); - - if (create_error) { - int open_flag = op->open_flag; - - error = create_error; - if ((open_flag & O_EXCL)) { - if (!dentry->d_inode) - goto out; - } else if (!dentry->d_inode) { - goto out; - } else if ((open_flag & O_TRUNC) && - d_is_reg(dentry)) { - goto out; - } - /* will fail later, go on to get the right error */ - } + } + if (create_error && !dentry->d_inode) { + error = create_error; + goto out; } looked_up: path->dentry = dentry; @@ -4213,7 +4243,11 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, bool new_is_dir = false; unsigned max_links = new_dir->i_sb->s_max_links; - if (source == target) + /* + * Check source == target. + * On overlayfs need to look at underlying inodes. 
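full_name_hash() above mixes whole machine words and then masks the final partial word with bytemask_from_count(), so that bytes lying past the end of the name cannot influence the hash. A little-endian userspace illustration of that masking; the mask expression mirrors the kernel's generic little-endian definition and is valid here for 0 <= n < 8:

    #include <stdio.h>

    /* n low-order bytes set, little-endian layout, 0 <= n < 8 */
    static unsigned long long bytemask_from_count(unsigned int n)
    {
        return ~(~0ULL << (n * 8));
    }

    int main(void)
    {
        /* "world!!!" loaded as one little-endian word; the name is only
         * five bytes long, so the trailing "!!!" must be masked away. */
        unsigned long long last_word = 0x212121646c726f77ULL;

        /* prints 0x000000646c726f77: just the five bytes of "world" */
        printf("%#018llx\n", last_word & bytemask_from_count(5));
        return 0;
    }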
+ */ + if (vfs_select_inode(old_dentry, 0) == vfs_select_inode(new_dentry, 0)) return 0; error = may_delete(old_dir, old_dentry, is_dir); diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 0cdf497c91ef..2162434728c0 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -322,3 +322,90 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) brelse(di_bh); return acl; } + +int ocfs2_acl_chmod(struct inode *inode, struct buffer_head *bh) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct posix_acl *acl; + int ret; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) + return 0; + + acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, bh); + if (IS_ERR(acl) || !acl) + return PTR_ERR(acl); + ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + if (ret) + return ret; + ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS, + acl, NULL, NULL); + posix_acl_release(acl); + return ret; +} + +/* + * Initialize the ACLs of a new inode. If parent directory has default ACL, + * then clone to new inode. Called from ocfs2_mknod. + */ +int ocfs2_init_acl(handle_t *handle, + struct inode *inode, + struct inode *dir, + struct buffer_head *di_bh, + struct buffer_head *dir_bh, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct posix_acl *acl = NULL; + int ret = 0, ret2; + umode_t mode; + + if (!S_ISLNK(inode->i_mode)) { + if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { + acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT, + dir_bh); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } + if (!acl) { + mode = inode->i_mode & ~current_umask(); + ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode); + if (ret) { + mlog_errno(ret); + goto cleanup; + } + } + } + if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { + if (S_ISDIR(inode->i_mode)) { + ret = ocfs2_set_acl(handle, inode, di_bh, + ACL_TYPE_DEFAULT, acl, + meta_ac, data_ac); + if (ret) + goto cleanup; + } + mode = inode->i_mode; + ret = __posix_acl_create(&acl, GFP_NOFS, &mode); + if (ret < 0) + return ret; + + ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode); + if (ret2) { + mlog_errno(ret2); + ret = ret2; + goto cleanup; + } + if (ret > 0) { + ret = ocfs2_set_acl(handle, inode, + di_bh, ACL_TYPE_ACCESS, + acl, meta_ac, data_ac); + } + } +cleanup: + posix_acl_release(acl); + return ret; +} diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index 3fce68d08625..2783a75b3999 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -35,5 +35,10 @@ int ocfs2_set_acl(handle_t *handle, struct posix_acl *acl, struct ocfs2_alloc_context *meta_ac, struct ocfs2_alloc_context *data_ac); +extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *); +extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, + struct buffer_head *, struct buffer_head *, + struct ocfs2_alloc_context *, + struct ocfs2_alloc_context *); #endif /* OCFS2_ACL_H */ diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 5308841756be..59cce53c91d8 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1268,20 +1268,20 @@ bail_unlock_rw: if (size_change) ocfs2_rw_unlock(inode, 1); bail: - brelse(bh); /* Release quota pointers in case we acquired them */ for (qtype = 0; qtype < OCFS2_MAXQUOTAS; qtype++) dqput(transfer_to[qtype]); if (!status && attr->ia_valid & ATTR_MODE) { - status = posix_acl_chmod(inode, inode->i_mode); + status = ocfs2_acl_chmod(inode, bh); if (status < 0) mlog_errno(status); } if (inode_locked) 
ocfs2_inode_unlock(inode, 1); + brelse(bh); return status; } diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 6b3e87189a64..a8f1225e6d9b 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -259,7 +259,6 @@ static int ocfs2_mknod(struct inode *dir, struct ocfs2_dir_lookup_result lookup = { NULL, }; sigset_t oldset; int did_block_signals = 0; - struct posix_acl *default_acl = NULL, *acl = NULL; struct ocfs2_dentry_lock *dl = NULL; trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name, @@ -367,12 +366,6 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } - status = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); - if (status) { - mlog_errno(status); - goto leave; - } - handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb, S_ISDIR(mode), xattr_credits)); @@ -421,16 +414,8 @@ static int ocfs2_mknod(struct inode *dir, inc_nlink(dir); } - if (default_acl) { - status = ocfs2_set_acl(handle, inode, new_fe_bh, - ACL_TYPE_DEFAULT, default_acl, - meta_ac, data_ac); - } - if (!status && acl) { - status = ocfs2_set_acl(handle, inode, new_fe_bh, - ACL_TYPE_ACCESS, acl, - meta_ac, data_ac); - } + status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh, + meta_ac, data_ac); if (status < 0) { mlog_errno(status); @@ -472,10 +457,6 @@ static int ocfs2_mknod(struct inode *dir, d_instantiate(dentry, inode); status = 0; leave: - if (default_acl) - posix_acl_release(default_acl); - if (acl) - posix_acl_release(acl); if (status < 0 && did_quota_inode) dquot_free_inode(inode); if (handle) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 744d5d90c363..92bbe93bfe10 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4248,20 +4248,12 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, struct inode *inode = d_inode(old_dentry); struct buffer_head *old_bh = NULL; struct inode *new_orphan_inode = NULL; - struct posix_acl *default_acl, *acl; - umode_t mode; if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) return -EOPNOTSUPP; - mode = inode->i_mode; - error = posix_acl_create(dir, &mode, &default_acl, &acl); - if (error) { - mlog_errno(error); - return error; - } - error = ocfs2_create_inode_in_orphan(dir, mode, + error = ocfs2_create_inode_in_orphan(dir, inode->i_mode, &new_orphan_inode); if (error) { mlog_errno(error); @@ -4300,16 +4292,11 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, /* If the security isn't preserved, we need to re-initialize them. 
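ocfs2_init_acl() above consolidates the create path: when the parent directory has no default ACL the new inode's mode is simply masked by the process umask, and when it does, the mode and access ACL are derived from that default ACL via __posix_acl_create() instead. A deliberately simplified userspace model of that decision, ignoring the details of ACL-to-mode conversion:

    #include <stdio.h>

    struct default_acl { int present; unsigned int derived_mode; };

    static unsigned int create_mode(const struct default_acl *parent_default,
                                    unsigned int requested_mode,
                                    unsigned int umask)
    {
        if (!parent_default->present)
            return requested_mode & ~umask;     /* classic umask behaviour */
        return parent_default->derived_mode;    /* stand-in for __posix_acl_create() */
    }

    int main(void)
    {
        struct default_acl none = { 0, 0 };
        struct default_acl dacl = { 1, 0664 };

        /* prints "644 664" for a 0666 create request and umask 022 */
        printf("%o %o\n", create_mode(&none, 0666, 022),
                          create_mode(&dacl, 0666, 022));
        return 0;
    }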
*/ if (!preserve) { error = ocfs2_init_security_and_acl(dir, new_orphan_inode, - &new_dentry->d_name, - default_acl, acl); + &new_dentry->d_name); if (error) mlog_errno(error); } out: - if (default_acl) - posix_acl_release(default_acl); - if (acl) - posix_acl_release(acl); if (!error) { error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode, new_dentry); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 7d3d979f57d9..f19b7381a998 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7216,12 +7216,10 @@ out: */ int ocfs2_init_security_and_acl(struct inode *dir, struct inode *inode, - const struct qstr *qstr, - struct posix_acl *default_acl, - struct posix_acl *acl) + const struct qstr *qstr) { - struct buffer_head *dir_bh = NULL; int ret = 0; + struct buffer_head *dir_bh = NULL; ret = ocfs2_init_security_get(inode, dir, qstr, NULL); if (ret) { @@ -7234,11 +7232,9 @@ int ocfs2_init_security_and_acl(struct inode *dir, mlog_errno(ret); goto leave; } - - if (!ret && default_acl) - ret = ocfs2_iop_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); - if (!ret && acl) - ret = ocfs2_iop_set_acl(inode, acl, ACL_TYPE_ACCESS); + ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL); + if (ret) + mlog_errno(ret); ocfs2_inode_unlock(dir, 0); brelse(dir_bh); diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index f10d5b93c366..1633cc15ea1f 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -94,7 +94,5 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, bool preserve_security); int ocfs2_init_security_and_acl(struct inode *dir, struct inode *inode, - const struct qstr *qstr, - struct posix_acl *default_acl, - struct posix_acl *acl); + const struct qstr *qstr); #endif /* OCFS2_XATTR_H */ diff --git a/fs/open.c b/fs/open.c index 17cb6b1dab75..081d3d6df74b 100644 --- a/fs/open.c +++ b/fs/open.c @@ -840,16 +840,12 @@ EXPORT_SYMBOL(file_path); int vfs_open(const struct path *path, struct file *file, const struct cred *cred) { - struct dentry *dentry = path->dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = vfs_select_inode(path->dentry, file->f_flags); - file->f_path = *path; - if (dentry->d_flags & DCACHE_OP_SELECT_INODE) { - inode = dentry->d_op->d_select_inode(dentry, file->f_flags); - if (IS_ERR(inode)) - return PTR_ERR(inode); - } + if (IS_ERR(inode)) + return PTR_ERR(inode); + file->f_path = *path; return do_dentry_open(file, inode, NULL, cred); } diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 5d972e6cd3fe..791235e03d17 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -411,9 +411,7 @@ static inline struct dentry *ovl_lookup_real(struct dentry *dir, { struct dentry *dentry; - inode_lock(dir->d_inode); - dentry = lookup_one_len(name->name, dir, name->len); - inode_unlock(dir->d_inode); + dentry = lookup_hash(name, dir); if (IS_ERR(dentry)) { if (PTR_ERR(dentry) == -ENOENT) diff --git a/fs/splice.c b/fs/splice.c index b018eb485019..dd9bf7e410d2 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1143,6 +1143,9 @@ static long do_splice_to(struct file *in, loff_t *ppos, if (unlikely(ret < 0)) return ret; + if (unlikely(len > MAX_RW_COUNT)) + len = MAX_RW_COUNT; + if (in->f_op->splice_read) splice_read = in->f_op->splice_read; else diff --git a/include/asm-generic/rwsem.h b/include/asm-generic/rwsem.h index d6d5dc98d7da..3fc94a046bf5 100644 --- a/include/asm-generic/rwsem.h +++ b/include/asm-generic/rwsem.h @@ -53,7 +53,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) /* * lock for writing */ -static inline void 
__down_write_nested(struct rw_semaphore *sem, int subclass) +static inline void __down_write(struct rw_semaphore *sem) { long tmp; @@ -63,9 +63,16 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) rwsem_down_write_failed(sem); } -static inline void __down_write(struct rw_semaphore *sem) +static inline int __down_write_killable(struct rw_semaphore *sem) { - __down_write_nested(sem, 0); + long tmp; + + tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS, + (atomic_long_t *)&sem->count); + if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) + if (IS_ERR(rwsem_down_write_failed_killable(sem))) + return -EINTR; + return 0; } static inline int __down_write_trylock(struct rw_semaphore *sem) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 339125bb4d2c..6a67ab94b553 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -245,7 +245,9 @@ #define INIT_TASK_DATA(align) \ . = ALIGN(align); \ - *(.data..init_task) + VMLINUX_SYMBOL(__start_init_task) = .; \ + *(.data..init_task) \ + VMLINUX_SYMBOL(__end_init_task) = .; /* * Read only Data diff --git a/include/linux/atomic.h b/include/linux/atomic.h index 506c3531832e..e451534fe54d 100644 --- a/include/linux/atomic.h +++ b/include/linux/atomic.h @@ -560,11 +560,11 @@ static inline int atomic_dec_if_positive(atomic_t *v) /** * atomic_fetch_or - perform *p |= mask and return old value of *p - * @p: pointer to atomic_t * @mask: mask to OR on the atomic_t + * @p: pointer to atomic_t */ #ifndef atomic_fetch_or -static inline int atomic_fetch_or(atomic_t *p, int mask) +static inline int atomic_fetch_or(int mask, atomic_t *p) { int old, val = atomic_read(p); diff --git a/include/linux/cpu.h b/include/linux/cpu.h index f9b1fab4388a..21597dcac0e2 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -59,25 +59,7 @@ struct notifier_block; * CPU notifier priorities. */ enum { - /* - * SCHED_ACTIVE marks a cpu which is coming up active during - * CPU_ONLINE and CPU_DOWN_FAILED and must be the first - * notifier. CPUSET_ACTIVE adjusts cpuset according to - * cpu_active mask right after SCHED_ACTIVE. During - * CPU_DOWN_PREPARE, SCHED_INACTIVE and CPUSET_INACTIVE are - * ordered in the similar way. - * - * This ordering guarantees consistent cpu_active mask and - * migration behavior to all cpu notifiers. 
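The atomic.h hunk above swaps atomic_fetch_or() to the (mask, pointer) argument order; the operation itself is unchanged, it ORs the mask in and returns the value the word held beforehand. A non-atomic model of those semantics with the new argument order, for illustration only:

    #include <stdio.h>

    static int fetch_or(int mask, int *p)
    {
        int old = *p;

        *p = old | mask;        /* the real helper does this atomically */
        return old;
    }

    int main(void)
    {
        int flags = 0x1;
        int old = fetch_or(0x4, &flags);

        printf("old=%#x new=%#x\n", old, flags);    /* old=0x1 new=0x5 */
        return 0;
    }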
- */ - CPU_PRI_SCHED_ACTIVE = INT_MAX, - CPU_PRI_CPUSET_ACTIVE = INT_MAX - 1, - CPU_PRI_SCHED_INACTIVE = INT_MIN + 1, - CPU_PRI_CPUSET_INACTIVE = INT_MIN, - - /* migration should happen before other stuff but after perf */ CPU_PRI_PERF = 20, - CPU_PRI_MIGRATION = 10, /* bring up workqueues before normal notifiers and down after */ CPU_PRI_WORKQUEUE_UP = 5, diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 5d68e15e46b7..386374d19987 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -8,6 +8,7 @@ enum cpuhp_state { CPUHP_BRINGUP_CPU, CPUHP_AP_IDLE_DEAD, CPUHP_AP_OFFLINE, + CPUHP_AP_SCHED_STARTING, CPUHP_AP_NOTIFY_STARTING, CPUHP_AP_ONLINE, CPUHP_TEARDOWN_CPU, @@ -16,6 +17,7 @@ enum cpuhp_state { CPUHP_AP_NOTIFY_ONLINE, CPUHP_AP_ONLINE_DYN, CPUHP_AP_ONLINE_DYN_END = CPUHP_AP_ONLINE_DYN + 30, + CPUHP_AP_ACTIVE, CPUHP_ONLINE, }; diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 40cee6b77a93..e828cf65d7df 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -743,12 +743,10 @@ set_cpu_present(unsigned int cpu, bool present) static inline void set_cpu_online(unsigned int cpu, bool online) { - if (online) { + if (online) cpumask_set_cpu(cpu, &__cpu_online_mask); - cpumask_set_cpu(cpu, &__cpu_active_mask); - } else { + else cpumask_clear_cpu(cpu, &__cpu_online_mask); - } } static inline void diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 4bb4de8d95ea..7e9422cb5989 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -565,4 +565,16 @@ static inline struct dentry *d_real(struct dentry *dentry) return dentry; } +static inline struct inode *vfs_select_inode(struct dentry *dentry, + unsigned open_flags) +{ + struct inode *inode = d_inode(dentry); + + if (inode && unlikely(dentry->d_flags & DCACHE_OP_SELECT_INODE)) + inode = dentry->d_op->d_select_inode(dentry, open_flags); + + return inode; +} + + #endif /* __LINUX_DCACHE_H */ diff --git a/include/linux/efi.h b/include/linux/efi.h index 1626474567ac..df7acb51f3cc 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -21,6 +21,7 @@ #include <linux/pfn.h> #include <linux/pstore.h> #include <linux/reboot.h> +#include <linux/screen_info.h> #include <asm/page.h> @@ -124,6 +125,13 @@ typedef struct { } efi_capsule_header_t; /* + * EFI capsule flags + */ +#define EFI_CAPSULE_PERSIST_ACROSS_RESET 0x00010000 +#define EFI_CAPSULE_POPULATE_SYSTEM_TABLE 0x00020000 +#define EFI_CAPSULE_INITIATE_RESET 0x00040000 + +/* * Allocation types for calls to boottime->allocate_pages. 
*/ #define EFI_ALLOCATE_ANY_PAGES 0 @@ -282,9 +290,10 @@ typedef struct { efi_status_t (*handle_protocol)(efi_handle_t, efi_guid_t *, void **); void *__reserved; void *register_protocol_notify; - void *locate_handle; + efi_status_t (*locate_handle)(int, efi_guid_t *, void *, + unsigned long *, efi_handle_t *); void *locate_device_path; - void *install_configuration_table; + efi_status_t (*install_configuration_table)(efi_guid_t *, void *); void *load_image; void *start_image; void *exit; @@ -623,6 +632,27 @@ void efi_native_runtime_setup(void); EFI_GUID(0x3152bca5, 0xeade, 0x433d, \ 0x86, 0x2e, 0xc0, 0x1c, 0xdc, 0x29, 0x1f, 0x44) +#define EFI_MEMORY_ATTRIBUTES_TABLE_GUID \ + EFI_GUID(0xdcfa911d, 0x26eb, 0x469f, \ + 0xa2, 0x20, 0x38, 0xb7, 0xdc, 0x46, 0x12, 0x20) + +#define EFI_CONSOLE_OUT_DEVICE_GUID \ + EFI_GUID(0xd3b36f2c, 0xd551, 0x11d4, \ + 0x9a, 0x46, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d) + +/* + * This GUID is used to pass to the kernel proper the struct screen_info + * structure that was populated by the stub based on the GOP protocol instance + * associated with ConOut + */ +#define LINUX_EFI_ARM_SCREEN_INFO_TABLE_GUID \ + EFI_GUID(0xe03fc20a, 0x85dc, 0x406e, \ + 0xb9, 0xe, 0x4a, 0xb5, 0x02, 0x37, 0x1d, 0x95) + +#define LINUX_EFI_LOADER_ENTRY_GUID \ + EFI_GUID(0x4a67b082, 0x0a4c, 0x41cf, \ + 0xb6, 0xc7, 0x44, 0x0b, 0x29, 0xbb, 0x8c, 0x4f) + typedef struct { efi_guid_t guid; u64 table; @@ -847,6 +877,14 @@ typedef struct { #define EFI_INVALID_TABLE_ADDR (~0UL) +typedef struct { + u32 version; + u32 num_entries; + u32 desc_size; + u32 reserved; + efi_memory_desc_t entry[0]; +} efi_memory_attributes_table_t; + /* * All runtime access to EFI goes through this structure: */ @@ -868,6 +906,7 @@ extern struct efi { unsigned long config_table; /* config tables */ unsigned long esrt; /* ESRT table */ unsigned long properties_table; /* properties table */ + unsigned long mem_attr_table; /* memory attributes table */ efi_get_time_t *get_time; efi_set_time_t *set_time; efi_get_wakeup_time_t *get_wakeup_time; @@ -883,7 +922,7 @@ extern struct efi { efi_get_next_high_mono_count_t *get_next_high_mono_count; efi_reset_system_t *reset_system; efi_set_virtual_address_map_t *set_virtual_address_map; - struct efi_memory_map *memmap; + struct efi_memory_map memmap; unsigned long flags; } efi; @@ -945,7 +984,6 @@ extern void efi_initialize_iomem_resources(struct resource *code_resource, extern void efi_get_time(struct timespec *now); extern void efi_reserve_boot_services(void); extern int efi_get_fdt_params(struct efi_fdt_params *params); -extern struct efi_memory_map memmap; extern struct kobject *efi_kobj; extern int efi_reboot_quirk_mode; @@ -957,12 +995,34 @@ extern void __init efi_fake_memmap(void); static inline void efi_fake_memmap(void) { } #endif +/* + * efi_memattr_perm_setter - arch specific callback function passed into + * efi_memattr_apply_permissions() that updates the + * mapping permissions described by the second + * argument in the page tables referred to by the + * first argument. 
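The iteration helpers a little further down walk efi.memmap by the firmware-provided desc_size rather than by sizeof(efi_memory_desc_t), because the firmware's descriptor may be larger than the structure the kernel was built against. A userspace sketch of that strided walk; the descriptor layout is truncated to a few assumed fields:

    #include <stdio.h>
    #include <stddef.h>
    #include <stdint.h>

    struct mem_desc {                   /* assumed subset of efi_memory_desc_t */
        uint32_t type;
        uint64_t phys_addr;
        uint64_t num_pages;
    };

    static void walk_memmap(const void *map, size_t map_size, size_t desc_size)
    {
        const char *p = map;
        const char *end = p + map_size;

        /* advance by desc_size, not by sizeof(struct mem_desc) */
        for (; p + desc_size <= end; p += desc_size) {
            const struct mem_desc *d = (const struct mem_desc *)p;

            printf("type=%u addr=%#llx pages=%llu\n", d->type,
                   (unsigned long long)d->phys_addr,
                   (unsigned long long)d->num_pages);
        }
    }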
+ */ +typedef int (*efi_memattr_perm_setter)(struct mm_struct *, efi_memory_desc_t *); + +extern int efi_memattr_init(void); +extern int efi_memattr_apply_permissions(struct mm_struct *mm, + efi_memattr_perm_setter fn); + /* Iterate through an efi_memory_map */ -#define for_each_efi_memory_desc(m, md) \ +#define for_each_efi_memory_desc_in_map(m, md) \ for ((md) = (m)->map; \ (md) <= (efi_memory_desc_t *)((m)->map_end - (m)->desc_size); \ (md) = (void *)(md) + (m)->desc_size) +/** + * for_each_efi_memory_desc - iterate over descriptors in efi.memmap + * @md: the efi_memory_desc_t * iterator + * + * Once the loop finishes @md must not be accessed. + */ +#define for_each_efi_memory_desc(md) \ + for_each_efi_memory_desc_in_map(&efi.memmap, md) + /* * Format an EFI memory descriptor's type and attributes to a user-provided * character buffer, as per snprintf(), and return the buffer. @@ -1000,7 +1060,6 @@ extern int __init efi_setup_pcdp_console(char *); * possible, remove EFI-related code altogether. */ #define EFI_BOOT 0 /* Were we booted from EFI? */ -#define EFI_SYSTEM_TABLES 1 /* Can we use EFI system tables? */ #define EFI_CONFIG_TABLES 2 /* Can we use EFI config tables? */ #define EFI_RUNTIME_SERVICES 3 /* Can we use runtime services? */ #define EFI_MEMMAP 4 /* Can we use EFI memory map? */ @@ -1026,8 +1085,16 @@ static inline bool efi_enabled(int feature) } static inline void efi_reboot(enum reboot_mode reboot_mode, const char *__unused) {} + +static inline bool +efi_capsule_pending(int *reset_type) +{ + return false; +} #endif +extern int efi_status_to_err(efi_status_t status); + /* * Variable Attributes */ @@ -1180,6 +1247,80 @@ struct efi_simple_text_output_protocol { void *test_string; }; +#define PIXEL_RGB_RESERVED_8BIT_PER_COLOR 0 +#define PIXEL_BGR_RESERVED_8BIT_PER_COLOR 1 +#define PIXEL_BIT_MASK 2 +#define PIXEL_BLT_ONLY 3 +#define PIXEL_FORMAT_MAX 4 + +struct efi_pixel_bitmask { + u32 red_mask; + u32 green_mask; + u32 blue_mask; + u32 reserved_mask; +}; + +struct efi_graphics_output_mode_info { + u32 version; + u32 horizontal_resolution; + u32 vertical_resolution; + int pixel_format; + struct efi_pixel_bitmask pixel_information; + u32 pixels_per_scan_line; +} __packed; + +struct efi_graphics_output_protocol_mode_32 { + u32 max_mode; + u32 mode; + u32 info; + u32 size_of_info; + u64 frame_buffer_base; + u32 frame_buffer_size; +} __packed; + +struct efi_graphics_output_protocol_mode_64 { + u32 max_mode; + u32 mode; + u64 info; + u64 size_of_info; + u64 frame_buffer_base; + u64 frame_buffer_size; +} __packed; + +struct efi_graphics_output_protocol_mode { + u32 max_mode; + u32 mode; + unsigned long info; + unsigned long size_of_info; + u64 frame_buffer_base; + unsigned long frame_buffer_size; +} __packed; + +struct efi_graphics_output_protocol_32 { + u32 query_mode; + u32 set_mode; + u32 blt; + u32 mode; +}; + +struct efi_graphics_output_protocol_64 { + u64 query_mode; + u64 set_mode; + u64 blt; + u64 mode; +}; + +struct efi_graphics_output_protocol { + unsigned long query_mode; + unsigned long set_mode; + unsigned long blt; + struct efi_graphics_output_protocol_mode *mode; +}; + +typedef efi_status_t (*efi_graphics_output_protocol_query_mode)( + struct efi_graphics_output_protocol *, u32, unsigned long *, + struct efi_graphics_output_mode_info **); + extern struct list_head efivar_sysfs_list; static inline void @@ -1195,8 +1336,7 @@ int efivars_unregister(struct efivars *efivars); struct kobject *efivars_kobject(void); int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, 
unsigned long, void *), - void *data, bool atomic, bool duplicates, - struct list_head *head); + void *data, bool duplicates, struct list_head *head); void efivar_entry_add(struct efivar_entry *entry, struct list_head *head); void efivar_entry_remove(struct efivar_entry *entry); @@ -1242,6 +1382,13 @@ int efivars_sysfs_init(void); #define EFIVARS_DATA_SIZE_MAX 1024 #endif /* CONFIG_EFI_VARS */ +extern bool efi_capsule_pending(int *reset_type); + +extern int efi_capsule_supported(efi_guid_t guid, u32 flags, + size_t size, int *reset); + +extern int efi_capsule_update(efi_capsule_header_t *capsule, + struct page **pages); #ifdef CONFIG_EFI_RUNTIME_MAP int efi_runtime_map_init(struct kobject *); @@ -1319,5 +1466,9 @@ efi_status_t handle_cmdline_files(efi_system_table_t *sys_table_arg, efi_status_t efi_parse_options(char *cmdline); +efi_status_t efi_setup_gop(efi_system_table_t *sys_table_arg, + struct screen_info *si, efi_guid_t *proto, + unsigned long size); + bool efi_runtime_disabled(void); #endif /* _LINUX_EFI_H */ diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index c06c44242f39..30f089ebe0a4 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -152,6 +152,8 @@ struct kernfs_syscall_ops { int (*rmdir)(struct kernfs_node *kn); int (*rename)(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name); + int (*show_path)(struct seq_file *sf, struct kernfs_node *kn, + struct kernfs_root *root); }; struct kernfs_root { diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index d10ef06971b5..eabe0138eb06 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -356,8 +356,13 @@ extern void lockdep_set_current_reclaim_state(gfp_t gfp_mask); extern void lockdep_clear_current_reclaim_state(void); extern void lockdep_trace_alloc(gfp_t mask); -extern void lock_pin_lock(struct lockdep_map *lock); -extern void lock_unpin_lock(struct lockdep_map *lock); +struct pin_cookie { unsigned int val; }; + +#define NIL_COOKIE (struct pin_cookie){ .val = 0U, } + +extern struct pin_cookie lock_pin_lock(struct lockdep_map *lock); +extern void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie); +extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); # define INIT_LOCKDEP .lockdep_recursion = 0, .lockdep_reclaim_gfp = 0, @@ -373,8 +378,9 @@ extern void lock_unpin_lock(struct lockdep_map *lock); #define lockdep_recursing(tsk) ((tsk)->lockdep_recursion) -#define lockdep_pin_lock(l) lock_pin_lock(&(l)->dep_map) -#define lockdep_unpin_lock(l) lock_unpin_lock(&(l)->dep_map) +#define lockdep_pin_lock(l) lock_pin_lock(&(l)->dep_map) +#define lockdep_repin_lock(l,c) lock_repin_lock(&(l)->dep_map, (c)) +#define lockdep_unpin_lock(l,c) lock_unpin_lock(&(l)->dep_map, (c)) #else /* !CONFIG_LOCKDEP */ @@ -427,8 +433,13 @@ struct lock_class_key { }; #define lockdep_recursing(tsk) (0) -#define lockdep_pin_lock(l) do { (void)(l); } while (0) -#define lockdep_unpin_lock(l) do { (void)(l); } while (0) +struct pin_cookie { }; + +#define NIL_COOKIE (struct pin_cookie){ } + +#define lockdep_pin_lock(l) ({ struct pin_cookie cookie; cookie; }) +#define lockdep_repin_lock(l, c) do { (void)(l); (void)(c); } while (0) +#define lockdep_unpin_lock(l, c) do { (void)(l); (void)(c); } while (0) #endif /* !LOCKDEP */ @@ -446,6 +457,18 @@ do { \ lock_acquired(&(_lock)->dep_map, _RET_IP_); \ } while (0) +#define LOCK_CONTENDED_RETURN(_lock, try, lock) \ +({ \ + int ____err = 0; \ + if (!try(_lock)) { \ + lock_contended(&(_lock)->dep_map, _RET_IP_); \ + 
____err = lock(_lock); \ + } \ + if (!____err) \ + lock_acquired(&(_lock)->dep_map, _RET_IP_); \ + ____err; \ +}) + #else /* CONFIG_LOCK_STAT */ #define lock_contended(lockdep_map, ip) do {} while (0) @@ -454,6 +477,9 @@ do { \ #define LOCK_CONTENDED(_lock, try, lock) \ lock(_lock) +#define LOCK_CONTENDED_RETURN(_lock, try, lock) \ + lock(_lock) + #endif /* CONFIG_LOCK_STAT */ #ifdef CONFIG_LOCKDEP diff --git a/include/linux/mfd/samsung/s2mps11.h b/include/linux/mfd/samsung/s2mps11.h index b288965e8101..2c14eeca46f0 100644 --- a/include/linux/mfd/samsung/s2mps11.h +++ b/include/linux/mfd/samsung/s2mps11.h @@ -173,10 +173,12 @@ enum s2mps11_regulators { #define S2MPS11_LDO_VSEL_MASK 0x3F #define S2MPS11_BUCK_VSEL_MASK 0xFF +#define S2MPS11_BUCK9_VSEL_MASK 0x1F #define S2MPS11_ENABLE_MASK (0x03 << S2MPS11_ENABLE_SHIFT) #define S2MPS11_ENABLE_SHIFT 0x06 #define S2MPS11_LDO_N_VOLTAGES (S2MPS11_LDO_VSEL_MASK + 1) #define S2MPS11_BUCK_N_VOLTAGES (S2MPS11_BUCK_VSEL_MASK + 1) +#define S2MPS11_BUCK9_N_VOLTAGES (S2MPS11_BUCK9_VSEL_MASK + 1) #define S2MPS11_RAMP_DELAY 25000 /* uV/us */ #define S2MPS11_CTRL1_PWRHOLD_MASK BIT(4) diff --git a/include/linux/mm.h b/include/linux/mm.h index 864d7221de84..8f468e0d2534 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -500,11 +500,20 @@ static inline int page_mapcount(struct page *page) #ifdef CONFIG_TRANSPARENT_HUGEPAGE int total_mapcount(struct page *page); +int page_trans_huge_mapcount(struct page *page, int *total_mapcount); #else static inline int total_mapcount(struct page *page) { return page_mapcount(page); } +static inline int page_trans_huge_mapcount(struct page *page, + int *total_mapcount) +{ + int mapcount = page_mapcount(page); + if (total_mapcount) + *total_mapcount = mapcount; + return mapcount; +} #endif static inline struct page *virt_to_head_page(const void *x) diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h index 70fffeba7495..a4441784503b 100644 --- a/include/linux/mmu_context.h +++ b/include/linux/mmu_context.h @@ -1,9 +1,16 @@ #ifndef _LINUX_MMU_CONTEXT_H #define _LINUX_MMU_CONTEXT_H +#include <asm/mmu_context.h> + struct mm_struct; void use_mm(struct mm_struct *mm); void unuse_mm(struct mm_struct *mm); +/* Architectures that care about IRQ state in switch_mm can override this. */ +#ifndef switch_mm_irqs_off +# define switch_mm_irqs_off switch_mm +#endif + #endif diff --git a/include/linux/namei.h b/include/linux/namei.h index 77d01700daf7..ec5ec2818a28 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -79,6 +79,8 @@ extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int); extern struct dentry *lookup_one_len(const char *, struct dentry *, int); extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int); +struct qstr; +extern struct dentry *lookup_hash(const struct qstr *, struct dentry *); extern int follow_down_one(struct path *); extern int follow_down(struct path *); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f291275ffd71..9e1c3ada91c4 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -58,7 +58,7 @@ struct perf_guest_info_callbacks { struct perf_callchain_entry { __u64 nr; - __u64 ip[PERF_MAX_STACK_DEPTH]; + __u64 ip[0]; /* /proc/sys/kernel/perf_event_max_stack */ }; struct perf_raw_record { @@ -151,6 +151,15 @@ struct hw_perf_event { */ struct task_struct *target; + /* + * PMU would store hardware filter configuration + * here. 
+ */ + void *addr_filters; + + /* Last sync'ed generation of filters */ + unsigned long addr_filters_gen; + /* * hw_perf_event::state flags; used to track the PERF_EF_* state. */ @@ -216,6 +225,7 @@ struct perf_event; #define PERF_PMU_CAP_AUX_SW_DOUBLEBUF 0x08 #define PERF_PMU_CAP_EXCLUSIVE 0x10 #define PERF_PMU_CAP_ITRACE 0x20 +#define PERF_PMU_CAP_HETEROGENEOUS_CPUS 0x40 /** * struct pmu - generic performance monitoring unit @@ -240,6 +250,9 @@ struct pmu { int task_ctx_nr; int hrtimer_interval_ms; + /* number of address filters this PMU can do */ + unsigned int nr_addr_filters; + /* * Fully disable/enable this PMU, can be used to protect from the PMI * as well as for lazy/batch writing of the MSRs. @@ -393,12 +406,71 @@ struct pmu { void (*free_aux) (void *aux); /* optional */ /* + * Validate address range filters: make sure the HW supports the + * requested configuration and number of filters; return 0 if the + * supplied filters are valid, -errno otherwise. + * + * Runs in the context of the ioctl()ing process and is not serialized + * with the rest of the PMU callbacks. + */ + int (*addr_filters_validate) (struct list_head *filters); + /* optional */ + + /* + * Synchronize address range filter configuration: + * translate hw-agnostic filters into hardware configuration in + * event::hw::addr_filters. + * + * Runs as a part of filter sync sequence that is done in ->start() + * callback by calling perf_event_addr_filters_sync(). + * + * May (and should) traverse event::addr_filters::list, for which its + * caller provides necessary serialization. + */ + void (*addr_filters_sync) (struct perf_event *event); + /* optional */ + + /* * Filter events for PMU-specific reasons. */ int (*filter_match) (struct perf_event *event); /* optional */ }; /** + * struct perf_addr_filter - address range filter definition + * @entry: event's filter list linkage + * @inode: object file's inode for file-based filters + * @offset: filter range offset + * @size: filter range size + * @range: 1: range, 0: address + * @filter: 1: filter/start, 0: stop + * + * This is a hardware-agnostic filter configuration as specified by the user. + */ +struct perf_addr_filter { + struct list_head entry; + struct inode *inode; + unsigned long offset; + unsigned long size; + unsigned int range : 1, + filter : 1; +}; + +/** + * struct perf_addr_filters_head - container for address range filters + * @list: list of filters for this event + * @lock: spinlock that serializes accesses to the @list and event's + * (and its children's) filter generations. + * + * A child event will use parent's @list (and therefore @lock), so they are + * bundled together; see perf_event_addr_filters(). 
+ */ +struct perf_addr_filters_head { + struct list_head list; + raw_spinlock_t lock; +}; + +/** * enum perf_event_active_state - the states of a event */ enum perf_event_active_state { @@ -566,6 +638,12 @@ struct perf_event { atomic_t event_limit; + /* address range filters */ + struct perf_addr_filters_head addr_filters; + /* vma address array for file-based filders */ + unsigned long *addr_filters_offs; + unsigned long addr_filters_gen; + void (*destroy)(struct perf_event *); struct rcu_head rcu_head; @@ -834,9 +912,25 @@ extern int perf_event_overflow(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); +extern void perf_event_output_forward(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs); +extern void perf_event_output_backward(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs); extern void perf_event_output(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs); + struct perf_sample_data *data, + struct pt_regs *regs); + +static inline bool +is_default_overflow_handler(struct perf_event *event) +{ + if (likely(event->overflow_handler == perf_event_output_forward)) + return true; + if (unlikely(event->overflow_handler == perf_event_output_backward)) + return true; + return false; +} extern void perf_event_header__init_id(struct perf_event_header *header, @@ -977,9 +1071,11 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, extern int get_callchain_buffers(void); extern void put_callchain_buffers(void); +extern int sysctl_perf_event_max_stack; + static inline int perf_callchain_store(struct perf_callchain_entry *entry, u64 ip) { - if (entry->nr < PERF_MAX_STACK_DEPTH) { + if (entry->nr < sysctl_perf_event_max_stack) { entry->ip[entry->nr++] = ip; return 0; } else { @@ -1001,6 +1097,8 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +int perf_event_max_stack_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); static inline bool perf_paranoid_tracepoint_raw(void) { @@ -1045,8 +1143,41 @@ static inline bool has_aux(struct perf_event *event) return event->pmu->setup_aux; } +static inline bool is_write_backward(struct perf_event *event) +{ + return !!event->attr.write_backward; +} + +static inline bool has_addr_filter(struct perf_event *event) +{ + return event->pmu->nr_addr_filters; +} + +/* + * An inherited event uses parent's filters + */ +static inline struct perf_addr_filters_head * +perf_event_addr_filters(struct perf_event *event) +{ + struct perf_addr_filters_head *ifh = &event->addr_filters; + + if (event->parent) + ifh = &event->parent->addr_filters; + + return ifh; +} + +extern void perf_event_addr_filters_sync(struct perf_event *event); + extern int perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size); +extern int perf_output_begin_forward(struct perf_output_handle *handle, + struct perf_event *event, + unsigned int size); +extern int perf_output_begin_backward(struct perf_output_handle *handle, + struct perf_event *event, + unsigned int size); + extern void perf_output_end(struct perf_output_handle *handle); extern unsigned int perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len); diff --git a/include/linux/pnp.h b/include/linux/pnp.h index 5df733b8f704..2588ca6a9028 100644 --- a/include/linux/pnp.h +++ b/include/linux/pnp.h 
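To make the new filter plumbing concrete, here is a hedged sketch of a hypothetical PMU driver's ->addr_filters_sync() callback; the driver name and hw_write_range_reg() are invented for illustration, while perf_event_addr_filters(), the filter list walk, and addr_filters_offs are the interfaces declared above. The core calls this callback from perf_event_addr_filters_sync() with the filters lock held and bumps hw.addr_filters_gen afterwards, so the callback only has to translate the generic filters into hardware state:

	static void hypothetical_pmu_addr_filters_sync(struct perf_event *event)
	{
		struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
		struct perf_addr_filter *filter;
		int i = 0;

		list_for_each_entry(filter, &ifh->list, entry) {
			/*
			 * addr_filters_offs[i] holds the vma start resolved by
			 * the core; combine it with the filter's file offset and
			 * size to program one address-range register.
			 */
			hw_write_range_reg(i,
					   event->addr_filters_offs[i] + filter->offset,
					   filter->size, filter->filter);
			i++;
		}
	}
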
@@ -337,9 +337,11 @@ extern struct mutex pnp_res_mutex; #ifdef CONFIG_PNPBIOS extern struct pnp_protocol pnpbios_protocol; +extern bool arch_pnpbios_disabled(void); #define pnp_device_is_pnpbios(dev) ((dev)->protocol == (&pnpbios_protocol)) #else #define pnp_device_is_pnpbios(dev) 0 +#define arch_pnpbios_disabled() false #endif #ifdef CONFIG_PNPACPI diff --git a/include/linux/proportions.h b/include/linux/proportions.h deleted file mode 100644 index 21221338ad18..000000000000 --- a/include/linux/proportions.h +++ /dev/null @@ -1,137 +0,0 @@ -/* - * FLoating proportions - * - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra - * - * This file contains the public data structure and API definitions. - */ - -#ifndef _LINUX_PROPORTIONS_H -#define _LINUX_PROPORTIONS_H - -#include <linux/percpu_counter.h> -#include <linux/spinlock.h> -#include <linux/mutex.h> -#include <linux/gfp.h> - -struct prop_global { - /* - * The period over which we differentiate - * - * period = 2^shift - */ - int shift; - /* - * The total event counter aka 'time'. - * - * Treated as an unsigned long; the lower 'shift - 1' bits are the - * counter bits, the remaining upper bits the period counter. - */ - struct percpu_counter events; -}; - -/* - * global proportion descriptor - * - * this is needed to consistently flip prop_global structures. - */ -struct prop_descriptor { - int index; - struct prop_global pg[2]; - struct mutex mutex; /* serialize the prop_global switch */ -}; - -int prop_descriptor_init(struct prop_descriptor *pd, int shift, gfp_t gfp); -void prop_change_shift(struct prop_descriptor *pd, int new_shift); - -/* - * ----- PERCPU ------ - */ - -struct prop_local_percpu { - /* - * the local events counter - */ - struct percpu_counter events; - - /* - * snapshot of the last seen global state - */ - int shift; - unsigned long period; - raw_spinlock_t lock; /* protect the snapshot state */ -}; - -int prop_local_init_percpu(struct prop_local_percpu *pl, gfp_t gfp); -void prop_local_destroy_percpu(struct prop_local_percpu *pl); -void __prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl); -void prop_fraction_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl, - long *numerator, long *denominator); - -static inline -void prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl) -{ - unsigned long flags; - - local_irq_save(flags); - __prop_inc_percpu(pd, pl); - local_irq_restore(flags); -} - -/* - * Limit the time part in order to ensure there are some bits left for the - * cycle counter and fraction multiply. 
- */ -#if BITS_PER_LONG == 32 -#define PROP_MAX_SHIFT (3*BITS_PER_LONG/4) -#else -#define PROP_MAX_SHIFT (BITS_PER_LONG/2) -#endif - -#define PROP_FRAC_SHIFT (BITS_PER_LONG - PROP_MAX_SHIFT - 1) -#define PROP_FRAC_BASE (1UL << PROP_FRAC_SHIFT) - -void __prop_inc_percpu_max(struct prop_descriptor *pd, - struct prop_local_percpu *pl, long frac); - - -/* - * ----- SINGLE ------ - */ - -struct prop_local_single { - /* - * the local events counter - */ - unsigned long events; - - /* - * snapshot of the last seen global state - * and a lock protecting this state - */ - unsigned long period; - int shift; - raw_spinlock_t lock; /* protect the snapshot state */ -}; - -#define INIT_PROP_LOCAL_SINGLE(name) \ -{ .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ -} - -int prop_local_init_single(struct prop_local_single *pl); -void prop_local_destroy_single(struct prop_local_single *pl); -void __prop_inc_single(struct prop_descriptor *pd, struct prop_local_single *pl); -void prop_fraction_single(struct prop_descriptor *pd, struct prop_local_single *pl, - long *numerator, long *denominator); - -static inline -void prop_inc_single(struct prop_descriptor *pd, struct prop_local_single *pl) -{ - unsigned long flags; - - local_irq_save(flags); - __prop_inc_single(pd, pl); - local_irq_restore(flags); -} - -#endif /* _LINUX_PROPORTIONS_H */ diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 2657aff2725b..5f1533e3d032 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -508,14 +508,7 @@ int rcu_read_lock_bh_held(void); * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side * critical section unless it can prove otherwise. */ -#ifdef CONFIG_PREEMPT_COUNT int rcu_read_lock_sched_held(void); -#else /* #ifdef CONFIG_PREEMPT_COUNT */ -static inline int rcu_read_lock_sched_held(void) -{ - return 1; -} -#endif /* #else #ifdef CONFIG_PREEMPT_COUNT */ #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ @@ -532,18 +525,10 @@ static inline int rcu_read_lock_bh_held(void) return 1; } -#ifdef CONFIG_PREEMPT_COUNT static inline int rcu_read_lock_sched_held(void) { - return preempt_count() != 0 || irqs_disabled(); -} -#else /* #ifdef CONFIG_PREEMPT_COUNT */ -static inline int rcu_read_lock_sched_held(void) -{ - return 1; + return !preemptible(); } -#endif /* #else #ifdef CONFIG_PREEMPT_COUNT */ - #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ #ifdef CONFIG_PROVE_RCU @@ -1144,4 +1129,17 @@ static inline void rcu_sysidle_force_exit(void) #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ +/* + * Dump the ftrace buffer, but only one time per callsite per boot. + */ +#define rcu_ftrace_dump(oops_dump_mode) \ +do { \ + static atomic_t ___rfd_beenhere = ATOMIC_INIT(0); \ + \ + if (!atomic_read(&___rfd_beenhere) && \ + !atomic_xchg(&___rfd_beenhere, 1)) \ + ftrace_dump(oops_dump_mode); \ +} while (0) + + #endif /* __LINUX_RCUPDATE_H */ diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 64809aea661c..93aea75029fb 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -149,6 +149,22 @@ static inline unsigned long rcu_batches_completed_sched(void) return 0; } +/* + * Return the number of expedited grace periods completed. + */ +static inline unsigned long rcu_exp_batches_completed(void) +{ + return 0; +} + +/* + * Return the number of expedited sched grace periods completed. 
+ */ +static inline unsigned long rcu_exp_batches_completed_sched(void) +{ + return 0; +} + static inline void rcu_force_quiescent_state(void) { } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index ad1eda9fa4da..5043cb823fb2 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -87,6 +87,8 @@ unsigned long rcu_batches_started_sched(void); unsigned long rcu_batches_completed(void); unsigned long rcu_batches_completed_bh(void); unsigned long rcu_batches_completed_sched(void); +unsigned long rcu_exp_batches_completed(void); +unsigned long rcu_exp_batches_completed_sched(void); void show_rcu_gp_kthreads(void); void rcu_force_quiescent_state(void); diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h index 561e8615528d..ae0528b834cd 100644 --- a/include/linux/rwsem-spinlock.h +++ b/include/linux/rwsem-spinlock.h @@ -34,7 +34,7 @@ struct rw_semaphore { extern void __down_read(struct rw_semaphore *sem); extern int __down_read_trylock(struct rw_semaphore *sem); extern void __down_write(struct rw_semaphore *sem); -extern void __down_write_nested(struct rw_semaphore *sem, int subclass); +extern int __must_check __down_write_killable(struct rw_semaphore *sem); extern int __down_write_trylock(struct rw_semaphore *sem); extern void __up_read(struct rw_semaphore *sem); extern void __up_write(struct rw_semaphore *sem); diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 8f498cdde280..d1c12d160ace 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -14,6 +14,7 @@ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/atomic.h> +#include <linux/err.h> #ifdef CONFIG_RWSEM_SPIN_ON_OWNER #include <linux/osq_lock.h> #endif @@ -43,6 +44,7 @@ struct rw_semaphore { extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_down_write_failed_killable(struct rw_semaphore *sem); extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *); extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); @@ -116,6 +118,7 @@ extern int down_read_trylock(struct rw_semaphore *sem); * lock for writing */ extern void down_write(struct rw_semaphore *sem); +extern int __must_check down_write_killable(struct rw_semaphore *sem); /* * trylock for writing -- returns 1 if successful, 0 if contention diff --git a/include/linux/sched.h b/include/linux/sched.h index 52c4847b05e2..6cc0df970f1a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -40,7 +40,6 @@ struct sched_param { #include <linux/pid.h> #include <linux/percpu.h> #include <linux/topology.h> -#include <linux/proportions.h> #include <linux/seccomp.h> #include <linux/rcupdate.h> #include <linux/rculist.h> @@ -178,9 +177,11 @@ extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load); extern void calc_global_load(unsigned long ticks); #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) -extern void update_cpu_load_nohz(int active); +extern void cpu_load_update_nohz_start(void); +extern void cpu_load_update_nohz_stop(void); #else -static inline void update_cpu_load_nohz(int active) { } +static inline void cpu_load_update_nohz_start(void) { } +static inline void cpu_load_update_nohz_stop(void) { } #endif extern void dump_cpu_task(int cpu); @@ -372,6 +373,15 @@ extern void cpu_init (void); extern void trap_init(void); extern void update_process_times(int user); extern void scheduler_tick(void); 
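As a usage sketch (not part of the patch text above): down_write_killable(), declared in the rwsem hunks, lets a writer back out with -EINTR when a fatal signal arrives while it sleeps, and LOCK_CONTENDED_RETURN() from the lockdep hunk exists so such a primitive can propagate the slow path's error while keeping lock-stat accounting intact. Roughly:

	/* caller side */
	if (down_write_killable(&mm->mmap_sem))
		return -EINTR;
	/* ... modify the address space ... */
	up_write(&mm->mmap_sem);

	/* implementation shape; the real kernel/locking/rwsem.c body is not shown in this excerpt */
	int __sched down_write_killable(struct rw_semaphore *sem)
	{
		might_sleep();
		rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);

		if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
					  __down_write_killable)) {
			rwsem_release(&sem->dep_map, 1, _RET_IP_);
			return -EINTR;
		}

		return 0;
	}
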
+extern int sched_cpu_starting(unsigned int cpu); +extern int sched_cpu_activate(unsigned int cpu); +extern int sched_cpu_deactivate(unsigned int cpu); + +#ifdef CONFIG_HOTPLUG_CPU +extern int sched_cpu_dying(unsigned int cpu); +#else +# define sched_cpu_dying NULL +#endif extern void sched_show_task(struct task_struct *p); @@ -935,9 +945,19 @@ enum cpu_idle_type { }; /* + * Integer metrics need fixed point arithmetic, e.g., sched/fair + * has a few: load, load_avg, util_avg, freq, and capacity. + * + * We define a basic fixed point arithmetic range, and then formalize + * all these metrics based on that basic range. + */ +# define SCHED_FIXEDPOINT_SHIFT 10 +# define SCHED_FIXEDPOINT_SCALE (1L << SCHED_FIXEDPOINT_SHIFT) + +/* * Increase resolution of cpu_capacity calculations */ -#define SCHED_CAPACITY_SHIFT 10 +#define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT #define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) /* @@ -1199,18 +1219,56 @@ struct load_weight { }; /* - * The load_avg/util_avg accumulates an infinite geometric series. - * 1) load_avg factors frequency scaling into the amount of time that a - * sched_entity is runnable on a rq into its weight. For cfs_rq, it is the - * aggregated such weights of all runnable and blocked sched_entities. - * 2) util_avg factors frequency and cpu scaling into the amount of time - * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE]. - * For cfs_rq, it is the aggregated such times of all runnable and + * The load_avg/util_avg accumulates an infinite geometric series + * (see __update_load_avg() in kernel/sched/fair.c). + * + * [load_avg definition] + * + * load_avg = runnable% * scale_load_down(load) + * + * where runnable% is the time ratio that a sched_entity is runnable. + * For cfs_rq, it is the aggregated load_avg of all runnable and * blocked sched_entities. - * The 64 bit load_sum can: - * 1) for cfs_rq, afford 4353082796 (=2^64/47742/88761) entities with - * the highest weight (=88761) always runnable, we should not overflow - * 2) for entity, support any load.weight always runnable + * + * load_avg may also take frequency scaling into account: + * + * load_avg = runnable% * scale_load_down(load) * freq% + * + * where freq% is the CPU frequency normalized to the highest frequency. + * + * [util_avg definition] + * + * util_avg = running% * SCHED_CAPACITY_SCALE + * + * where running% is the time ratio that a sched_entity is running on + * a CPU. For cfs_rq, it is the aggregated util_avg of all runnable + * and blocked sched_entities. + * + * util_avg may also factor frequency scaling and CPU capacity scaling: + * + * util_avg = running% * SCHED_CAPACITY_SCALE * freq% * capacity% + * + * where freq% is the same as above, and capacity% is the CPU capacity + * normalized to the greatest capacity (due to uarch differences, etc). + * + * N.B., the above ratios (runnable%, running%, freq%, and capacity%) + * themselves are in the range of [0, 1]. To do fixed point arithmetics, + * we therefore scale them to as large a range as necessary. This is for + * example reflected by util_avg's SCHED_CAPACITY_SCALE. + * + * [Overflow issue] + * + * The 64-bit load_sum can have 4353082796 (=2^64/47742/88761) entities + * with the highest load (=88761), always runnable on a single cfs_rq, + * and should not overflow as the number already hits PID_MAX_LIMIT. 
+ * + * For all other cases (including 32-bit kernels), struct load_weight's + * weight will overflow first before we do, because: + * + * Max(load_avg) <= Max(load.weight) + * + * Then it is the load_weight's responsibility to consider overflow + * issues. */ struct sched_avg { u64 last_update_time, load_sum; @@ -1596,6 +1654,7 @@ struct task_struct { unsigned long sas_ss_sp; size_t sas_ss_size; + unsigned sas_ss_flags; struct callback_head *task_works; @@ -1871,6 +1930,11 @@ extern int arch_task_struct_size __read_mostly; /* Future-safe accessor for struct task_struct's cpus_allowed. */ #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) +static inline int tsk_nr_cpus_allowed(struct task_struct *p) +{ + return p->nr_cpus_allowed; +} + #define TNF_MIGRATED 0x01 #define TNF_NO_GROUP 0x02 #define TNF_SHARED 0x04 @@ -2303,8 +2367,6 @@ extern unsigned long long notrace sched_clock(void); /* * See the comment in kernel/sched/clock.c */ -extern u64 cpu_clock(int cpu); -extern u64 local_clock(void); extern u64 running_clock(void); extern u64 sched_clock_cpu(int cpu); @@ -2323,6 +2385,16 @@ static inline void sched_clock_idle_sleep_event(void) static inline void sched_clock_idle_wakeup_event(u64 delta_ns) { } + +static inline u64 cpu_clock(int cpu) +{ + return sched_clock(); +} + +static inline u64 local_clock(void) +{ + return sched_clock(); +} #else /* * Architectures can set this to 1 if they have specified @@ -2337,6 +2409,26 @@ extern void clear_sched_clock_stable(void); extern void sched_clock_tick(void); extern void sched_clock_idle_sleep_event(void); extern void sched_clock_idle_wakeup_event(u64 delta_ns); + +/* + * As outlined in clock.c, provides a fast, high resolution, nanosecond + * time source that is monotonic per cpu argument and has bounded drift + * between cpus. + * + * ######################### BIG FAT WARNING ########################## + * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # + * # go backwards !! # + * #################################################################### + */ +static inline u64 cpu_clock(int cpu) +{ + return sched_clock_cpu(cpu); +} + +static inline u64 local_clock(void) +{ + return sched_clock_cpu(raw_smp_processor_id()); +} #endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -2575,6 +2667,18 @@ static inline int kill_cad_pid(int sig, int priv) */ static inline int on_sig_stack(unsigned long sp) { + /* + * If the signal stack is SS_AUTODISARM then, by construction, we + * can't be on the signal stack unless user code deliberately set + * SS_AUTODISARM when we were already on it. + * + * This improves reliability: if user state gets corrupted such that + * the stack pointer points very close to the end of the signal stack, + * then this check will enable the signal to be handled anyway. + */ + if (current->sas_ss_flags & SS_AUTODISARM) + return 0; + #ifdef CONFIG_STACK_GROWSUP return sp >= current->sas_ss_sp && sp - current->sas_ss_sp < current->sas_ss_size; @@ -2592,6 +2696,13 @@ static inline int sas_ss_flags(unsigned long sp) return on_sig_stack(sp) ? SS_ONSTACK : 0; } +static inline void sas_ss_reset(struct task_struct *p) +{ + p->sas_ss_sp = 0; + p->sas_ss_size = 0; + p->sas_ss_flags = SS_DISABLE; +} + static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig) { if (unlikely((ksig->ka.sa.sa_flags & SA_ONSTACK)) && ! 
sas_ss_flags(sp)) diff --git a/include/linux/signal.h b/include/linux/signal.h index 92557bbce7e7..3fbe81444d31 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -432,8 +432,10 @@ int __save_altstack(stack_t __user *, unsigned long); stack_t __user *__uss = uss; \ struct task_struct *t = current; \ put_user_ex((void __user *)t->sas_ss_sp, &__uss->ss_sp); \ - put_user_ex(sas_ss_flags(sp), &__uss->ss_flags); \ + put_user_ex(t->sas_ss_flags, &__uss->ss_flags); \ put_user_ex(t->sas_ss_size, &__uss->ss_size); \ + if (t->sas_ss_flags & SS_AUTODISARM) \ + sas_ss_reset(t); \ } while (0); #ifdef CONFIG_PROC_FS diff --git a/include/linux/swap.h b/include/linux/swap.h index 0a4cd4703f40..ad220359f1b0 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -418,7 +418,7 @@ extern sector_t swapdev_block(int, pgoff_t); extern int page_swapcount(struct page *); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); -extern int reuse_swap_page(struct page *); +extern bool reuse_swap_page(struct page *, int *); extern int try_to_free_swap(struct page *); struct backing_dev_info; @@ -513,8 +513,8 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -#define reuse_swap_page(page) \ - (!PageTransCompound(page) && page_mapcount(page) == 1) +#define reuse_swap_page(page, total_mapcount) \ + (page_trans_huge_mapcount(page, total_mapcount) == 1) static inline int try_to_free_swap(struct page *page) { diff --git a/include/linux/uio.h b/include/linux/uio.h index fd9bcfedad42..1b5d1cd796e2 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -87,6 +87,7 @@ size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i); size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i); size_t iov_iter_zero(size_t bytes, struct iov_iter *); unsigned long iov_iter_alignment(const struct iov_iter *i); +unsigned long iov_iter_gap_alignment(const struct iov_iter *i); void iov_iter_init(struct iov_iter *i, int direction, const struct iovec *iov, unsigned long nr_segs, size_t count); void iov_iter_kvec(struct iov_iter *i, int direction, const struct kvec *kvec, diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index ef72c4aada56..d3e756539d44 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -172,6 +172,77 @@ TRACE_EVENT(rcu_grace_period_init, ); /* + * Tracepoint for expedited grace-period events. Takes a string identifying + * the RCU flavor, the expedited grace-period sequence number, and a string + * identifying the grace-period-related event as follows: + * + * "snap": Captured snapshot of expedited grace period sequence number. + * "start": Started a real expedited grace period. + * "end": Ended a real expedited grace period. + * "endwake": Woke piggybackers up. + * "done": Someone else did the expedited grace period for us. + */ +TRACE_EVENT(rcu_exp_grace_period, + + TP_PROTO(const char *rcuname, unsigned long gpseq, const char *gpevent), + + TP_ARGS(rcuname, gpseq, gpevent), + + TP_STRUCT__entry( + __field(const char *, rcuname) + __field(unsigned long, gpseq) + __field(const char *, gpevent) + ), + + TP_fast_assign( + __entry->rcuname = rcuname; + __entry->gpseq = gpseq; + __entry->gpevent = gpevent; + ), + + TP_printk("%s %lu %s", + __entry->rcuname, __entry->gpseq, __entry->gpevent) +); + +/* + * Tracepoint for expedited grace-period funnel-locking events. 
Takes a + * string identifying the RCU flavor, an integer identifying the rcu_node + * combining-tree level, another pair of integers identifying the lowest- + * and highest-numbered CPU associated with the current rcu_node structure, + * and a string. identifying the grace-period-related event as follows: + * + * "nxtlvl": Advance to next level of rcu_node funnel + * "wait": Wait for someone else to do expedited GP + */ +TRACE_EVENT(rcu_exp_funnel_lock, + + TP_PROTO(const char *rcuname, u8 level, int grplo, int grphi, + const char *gpevent), + + TP_ARGS(rcuname, level, grplo, grphi, gpevent), + + TP_STRUCT__entry( + __field(const char *, rcuname) + __field(u8, level) + __field(int, grplo) + __field(int, grphi) + __field(const char *, gpevent) + ), + + TP_fast_assign( + __entry->rcuname = rcuname; + __entry->level = level; + __entry->grplo = grplo; + __entry->grphi = grphi; + __entry->gpevent = gpevent; + ), + + TP_printk("%s %d %d %d %s", + __entry->rcuname, __entry->level, __entry->grplo, + __entry->grphi, __entry->gpevent) +); + +/* * Tracepoint for RCU no-CBs CPU callback handoffs. This event is intended * to assist debugging of these handoffs. * @@ -704,11 +775,15 @@ TRACE_EVENT(rcu_barrier, #else /* #ifdef CONFIG_RCU_TRACE */ #define trace_rcu_grace_period(rcuname, gpnum, gpevent) do { } while (0) -#define trace_rcu_grace_period_init(rcuname, gpnum, level, grplo, grphi, \ - qsmask) do { } while (0) #define trace_rcu_future_grace_period(rcuname, gpnum, completed, c, \ level, grplo, grphi, event) \ do { } while (0) +#define trace_rcu_grace_period_init(rcuname, gpnum, level, grplo, grphi, \ + qsmask) do { } while (0) +#define trace_rcu_exp_grace_period(rcuname, gqseq, gpevent) \ + do { } while (0) +#define trace_rcu_exp_funnel_lock(rcuname, level, grplo, grphi, gpevent) \ + do { } while (0) #define trace_rcu_nocb_wake(rcuname, cpu, reason) do { } while (0) #define trace_rcu_preempt_task(rcuname, pid, gpnum) do { } while (0) #define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0) diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 1afe9623c1a7..43fc8d213472 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -340,7 +340,8 @@ struct perf_event_attr { comm_exec : 1, /* flag comm events that are due to an exec */ use_clockid : 1, /* use @clockid for time fields */ context_switch : 1, /* context switch data */ - __reserved_1 : 37; + write_backward : 1, /* Write ring buffer from end to beginning */ + __reserved_1 : 36; union { __u32 wakeup_events; /* wakeup every n events */ @@ -401,6 +402,7 @@ struct perf_event_attr { #define PERF_EVENT_IOC_SET_FILTER _IOW('$', 6, char *) #define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *) #define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32) +#define PERF_EVENT_IOC_PAUSE_OUTPUT _IOW('$', 9, __u32) enum perf_event_ioc_flags { PERF_IOC_FLAG_GROUP = 1U << 0, diff --git a/include/uapi/linux/signal.h b/include/uapi/linux/signal.h index e1bd50c29ded..cd0804b6bfa2 100644 --- a/include/uapi/linux/signal.h +++ b/include/uapi/linux/signal.h @@ -7,4 +7,9 @@ #define SS_ONSTACK 1 #define SS_DISABLE 2 +/* bit-flags */ +#define SS_AUTODISARM (1U << 31) /* disable sas during sighandling */ +/* mask for all SS_xxx flags */ +#define SS_FLAG_BITS SS_AUTODISARM + #endif /* _UAPI_LINUX_SIGNAL_H */ diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 499d9e933f8e..f5a19548be12 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -66,7 +66,7 @@ static struct bpf_map 
*stack_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || value_size < 8 || value_size % 8 || - value_size / 8 > PERF_MAX_STACK_DEPTH) + value_size / 8 > sysctl_perf_event_max_stack) return ERR_PTR(-EINVAL); /* hash table size must be power of 2 */ @@ -124,8 +124,8 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) struct perf_callchain_entry *trace; struct stack_map_bucket *bucket, *new_bucket, *old_bucket; u32 max_depth = map->value_size / 8; - /* stack_map_alloc() checks that max_depth <= PERF_MAX_STACK_DEPTH */ - u32 init_nr = PERF_MAX_STACK_DEPTH - max_depth; + /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ + u32 init_nr = sysctl_perf_event_max_stack - max_depth; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; u32 hash, id, trace_nr, trace_len; bool user = flags & BPF_F_USER_STACK; @@ -143,7 +143,7 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) return -EFAULT; /* get_perf_callchain() guarantees that trace->nr >= init_nr - * and trace-nr <= PERF_MAX_STACK_DEPTH, so trace_nr <= max_depth + * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth */ trace_nr = trace->nr - init_nr; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 909a7d31ffd3..86cb5c6e8932 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1215,6 +1215,41 @@ static void cgroup_destroy_root(struct cgroup_root *root) cgroup_free_root(root); } +/* + * look up cgroup associated with current task's cgroup namespace on the + * specified hierarchy + */ +static struct cgroup * +current_cgns_cgroup_from_root(struct cgroup_root *root) +{ + struct cgroup *res = NULL; + struct css_set *cset; + + lockdep_assert_held(&css_set_lock); + + rcu_read_lock(); + + cset = current->nsproxy->cgroup_ns->root_cset; + if (cset == &init_css_set) { + res = &root->cgrp; + } else { + struct cgrp_cset_link *link; + + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { + struct cgroup *c = link->cgrp; + + if (c->root == root) { + res = c; + break; + } + } + } + rcu_read_unlock(); + + BUG_ON(!res); + return res; +} + /* look up cgroup associated with given css_set on the specified hierarchy */ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, struct cgroup_root *root) @@ -1593,6 +1628,33 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) return 0; } +static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, + struct kernfs_root *kf_root) +{ + int len = 0; + char *buf = NULL; + struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root); + struct cgroup *ns_cgroup; + + buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + spin_lock_bh(&css_set_lock); + ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot); + len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX); + spin_unlock_bh(&css_set_lock); + + if (len >= PATH_MAX) + len = -ERANGE; + else if (len > 0) { + seq_escape(sf, buf, " \t\n\\"); + len = 0; + } + kfree(buf); + return len; +} + static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root) { @@ -5433,6 +5495,7 @@ static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .rename = cgroup_rename, + .show_path = cgroup_show_path, }; static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) diff --git a/kernel/cpu.c b/kernel/cpu.c index 3e3f6e49eabb..d948e44c471e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ 
-703,21 +703,6 @@ static int takedown_cpu(unsigned int cpu) struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int err; - /* - * By now we've cleared cpu_active_mask, wait for all preempt-disabled - * and RCU users of this state to go away such that all new such users - * will observe it. - * - * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might - * not imply sync_sched(), so wait for both. - * - * Do sync before park smpboot threads to take care the rcu boost case. - */ - if (IS_ENABLED(CONFIG_PREEMPT)) - synchronize_rcu_mult(call_rcu, call_rcu_sched); - else - synchronize_rcu(); - /* Park the smpboot threads */ kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread); smpboot_park_threads(cpu); @@ -923,8 +908,6 @@ void cpuhp_online_idle(enum cpuhp_state state) st->state = CPUHP_AP_ONLINE_IDLE; - /* The cpu is marked online, set it active now */ - set_cpu_active(cpu, true); /* Unpark the stopper thread and the hotplug thread of this cpu */ stop_machine_unpark(cpu); kthread_unpark(st->thread); @@ -1236,6 +1219,12 @@ static struct cpuhp_step cpuhp_ap_states[] = { .name = "ap:offline", .cant_stop = true, }, + /* First state is scheduler control. Interrupts are disabled */ + [CPUHP_AP_SCHED_STARTING] = { + .name = "sched:starting", + .startup = sched_cpu_starting, + .teardown = sched_cpu_dying, + }, /* * Low level startup/teardown notifiers. Run with interrupts * disabled. Will be removed once the notifiers are converted to @@ -1274,6 +1263,15 @@ static struct cpuhp_step cpuhp_ap_states[] = { * The dynamically registered state space is here */ +#ifdef CONFIG_SMP + /* Last state is scheduler control setting the cpu active */ + [CPUHP_AP_ACTIVE] = { + .name = "sched:active", + .startup = sched_cpu_activate, + .teardown = sched_cpu_deactivate, + }, +#endif + /* CPU is fully up and running. 
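For orientation, a sketch (not the actual kernel/sched/core.c code, which is outside this excerpt) of what the two new CPUHP_AP_ACTIVE callbacks amount to: set_cpu_online() no longer flips the active mask (see the cpumask.h hunk earlier), and the pre-teardown RCU synchronization removed from takedown_cpu() above moves to the deactivate side:

	int sched_cpu_activate(unsigned int cpu)
	{
		set_cpu_active(cpu, true);
		/* the real version also updates scheduling domains, etc. */
		return 0;
	}

	int sched_cpu_deactivate(unsigned int cpu)
	{
		set_cpu_active(cpu, false);
		/*
		 * Wait for preempt-disabled and RCU users of cpu_active_mask
		 * to notice the change, as takedown_cpu() used to do.
		 */
		if (IS_ENABLED(CONFIG_PREEMPT))
			synchronize_rcu_mult(call_rcu, call_rcu_sched);
		else
			synchronize_rcu();
		return 0;
	}
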
*/ [CPUHP_ONLINE] = { .name = "online", diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 343c22f5e867..b9325e7dcba1 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -18,6 +18,14 @@ struct callchain_cpus_entries { struct perf_callchain_entry *cpu_entries[0]; }; +int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH; + +static inline size_t perf_callchain_entry__sizeof(void) +{ + return (sizeof(struct perf_callchain_entry) + + sizeof(__u64) * sysctl_perf_event_max_stack); +} + static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); static atomic_t nr_callchain_events; static DEFINE_MUTEX(callchain_mutex); @@ -73,7 +81,7 @@ static int alloc_callchain_buffers(void) if (!entries) return -ENOMEM; - size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; + size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS; for_each_possible_cpu(cpu) { entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, @@ -147,7 +155,8 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx) cpu = smp_processor_id(); - return &entries->cpu_entries[cpu][*rctx]; + return (((void *)entries->cpu_entries[cpu]) + + (*rctx * perf_callchain_entry__sizeof())); } static void @@ -215,3 +224,25 @@ exit_put: return entry; } + +int perf_event_max_stack_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int new_value = sysctl_perf_event_max_stack, ret; + struct ctl_table new_table = *table; + + new_table.data = &new_value; + ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos); + if (ret || !write) + return ret; + + mutex_lock(&callchain_mutex); + if (atomic_read(&nr_callchain_events)) + ret = -EBUSY; + else + sysctl_perf_event_max_stack = new_value; + + mutex_unlock(&callchain_mutex); + + return ret; +} diff --git a/kernel/events/core.c b/kernel/events/core.c index c0ded2416615..050a290c72c7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -44,6 +44,8 @@ #include <linux/compat.h> #include <linux/bpf.h> #include <linux/filter.h> +#include <linux/namei.h> +#include <linux/parser.h> #include "internal.h" @@ -1927,8 +1929,13 @@ event_sched_in(struct perf_event *event, if (event->state <= PERF_EVENT_STATE_OFF) return 0; - event->state = PERF_EVENT_STATE_ACTIVE; - event->oncpu = smp_processor_id(); + WRITE_ONCE(event->oncpu, smp_processor_id()); + /* + * Order event::oncpu write to happen before the ACTIVE state + * is visible. + */ + smp_wmb(); + WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE); /* * Unthrottle events, since we scheduled we might have missed several @@ -2360,6 +2367,112 @@ void perf_event_enable(struct perf_event *event) } EXPORT_SYMBOL_GPL(perf_event_enable); +struct stop_event_data { + struct perf_event *event; + unsigned int restart; +}; + +static int __perf_event_stop(void *info) +{ + struct stop_event_data *sd = info; + struct perf_event *event = sd->event; + + /* if it's already INACTIVE, do nothing */ + if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) + return 0; + + /* matches smp_wmb() in event_sched_in() */ + smp_rmb(); + + /* + * There is a window with interrupts enabled before we get here, + * so we need to check again lest we try to stop another CPU's event. 
+ */ + if (READ_ONCE(event->oncpu) != smp_processor_id()) + return -EAGAIN; + + event->pmu->stop(event, PERF_EF_UPDATE); + + /* + * May race with the actual stop (through perf_pmu_output_stop()), + * but it is only used for events with AUX ring buffer, and such + * events will refuse to restart because of rb::aux_mmap_count==0, + * see comments in perf_aux_output_begin(). + * + * Since this is happening on a event-local CPU, no trace is lost + * while restarting. + */ + if (sd->restart) + event->pmu->start(event, PERF_EF_START); + + return 0; +} + +static int perf_event_restart(struct perf_event *event) +{ + struct stop_event_data sd = { + .event = event, + .restart = 1, + }; + int ret = 0; + + do { + if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) + return 0; + + /* matches smp_wmb() in event_sched_in() */ + smp_rmb(); + + /* + * We only want to restart ACTIVE events, so if the event goes + * inactive here (event->oncpu==-1), there's nothing more to do; + * fall through with ret==-ENXIO. + */ + ret = cpu_function_call(READ_ONCE(event->oncpu), + __perf_event_stop, &sd); + } while (ret == -EAGAIN); + + return ret; +} + +/* + * In order to contain the amount of racy and tricky in the address filter + * configuration management, it is a two part process: + * + * (p1) when userspace mappings change as a result of (1) or (2) or (3) below, + * we update the addresses of corresponding vmas in + * event::addr_filters_offs array and bump the event::addr_filters_gen; + * (p2) when an event is scheduled in (pmu::add), it calls + * perf_event_addr_filters_sync() which calls pmu::addr_filters_sync() + * if the generation has changed since the previous call. + * + * If (p1) happens while the event is active, we restart it to force (p2). + * + * (1) perf_addr_filters_apply(): adjusting filters' offsets based on + * pre-existing mappings, called once when new filters arrive via SET_FILTER + * ioctl; + * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly + * registered mapping, called for every new mmap(), with mm::mmap_sem down + * for reading; + * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process + * of exec. 
+ */ +void perf_event_addr_filters_sync(struct perf_event *event) +{ + struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); + + if (!has_addr_filter(event)) + return; + + raw_spin_lock(&ifh->lock); + if (event->addr_filters_gen != event->hw.addr_filters_gen) { + event->pmu->addr_filters_sync(event); + event->hw.addr_filters_gen = event->addr_filters_gen; + } + raw_spin_unlock(&ifh->lock); +} +EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync); + static int _perf_event_refresh(struct perf_event *event, int refresh) { /* @@ -3209,16 +3322,6 @@ out: put_ctx(clone_ctx); } -void perf_event_exec(void) -{ - int ctxn; - - rcu_read_lock(); - for_each_task_context_nr(ctxn) - perf_event_enable_on_exec(ctxn); - rcu_read_unlock(); -} - struct perf_read_data { struct perf_event *event; bool group; @@ -3720,6 +3823,9 @@ static bool exclusive_event_installable(struct perf_event *event, return true; } +static void perf_addr_filters_splice(struct perf_event *event, + struct list_head *head); + static void _free_event(struct perf_event *event) { irq_work_sync(&event->pending); @@ -3747,6 +3853,8 @@ static void _free_event(struct perf_event *event) } perf_event_free_bpf_prog(event); + perf_addr_filters_splice(event, NULL); + kfree(event->addr_filters_offs); if (event->destroy) event->destroy(event); @@ -4343,6 +4451,19 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon case PERF_EVENT_IOC_SET_BPF: return perf_event_set_bpf_prog(event, arg); + case PERF_EVENT_IOC_PAUSE_OUTPUT: { + struct ring_buffer *rb; + + rcu_read_lock(); + rb = rcu_dereference(event->rb); + if (!rb || !rb->nr_pages) { + rcu_read_unlock(); + return -EINVAL; + } + rb_toggle_paused(rb, !!arg); + rcu_read_unlock(); + return 0; + } default: return -ENOTTY; } @@ -4659,6 +4780,8 @@ static void perf_mmap_open(struct vm_area_struct *vma) event->pmu->event_mapped(event); } +static void perf_pmu_output_stop(struct perf_event *event); + /* * A buffer can be mmap()ed multiple times; either directly through the same * event, or through other events by use of perf_event_set_output(). @@ -4686,10 +4809,22 @@ static void perf_mmap_close(struct vm_area_struct *vma) */ if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) { + /* + * Stop all AUX events that are writing to this buffer, + * so that we can free its AUX pages and corresponding PMU + * data. Note that after rb::aux_mmap_count dropped to zero, + * they won't start any more (see perf_aux_output_begin()). 
+ */ + perf_pmu_output_stop(event); + + /* now it's safe to free the pages */ atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); vma->vm_mm->pinned_vm -= rb->aux_mmap_locked; + /* this has to be the last one */ rb_free_aux(rb); + WARN_ON_ONCE(atomic_read(&rb->aux_refcount)); + mutex_unlock(&event->mmap_mutex); } @@ -5630,9 +5765,13 @@ void perf_prepare_sample(struct perf_event_header *header, } } -void perf_event_output(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) +static void __always_inline +__perf_event_output(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs, + int (*output_begin)(struct perf_output_handle *, + struct perf_event *, + unsigned int)) { struct perf_output_handle handle; struct perf_event_header header; @@ -5642,7 +5781,7 @@ void perf_event_output(struct perf_event *event, perf_prepare_sample(&header, data, event, regs); - if (perf_output_begin(&handle, event, header.size)) + if (output_begin(&handle, event, header.size)) goto exit; perf_output_sample(&handle, &header, data, event); @@ -5653,6 +5792,30 @@ exit: rcu_read_unlock(); } +void +perf_event_output_forward(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + __perf_event_output(event, data, regs, perf_output_begin_forward); +} + +void +perf_event_output_backward(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + __perf_event_output(event, data, regs, perf_output_begin_backward); +} + +void +perf_event_output(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + __perf_event_output(event, data, regs, perf_output_begin); +} + /* * read event_id */ @@ -5698,15 +5861,18 @@ typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data); static void perf_event_aux_ctx(struct perf_event_context *ctx, perf_event_aux_output_cb output, - void *data) + void *data, bool all) { struct perf_event *event; list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (event->state < PERF_EVENT_STATE_INACTIVE) - continue; - if (!event_filter_match(event)) - continue; + if (!all) { + if (event->state < PERF_EVENT_STATE_INACTIVE) + continue; + if (!event_filter_match(event)) + continue; + } + output(event, data); } } @@ -5717,7 +5883,7 @@ perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data, { rcu_read_lock(); preempt_disable(); - perf_event_aux_ctx(task_ctx, output, data); + perf_event_aux_ctx(task_ctx, output, data, false); preempt_enable(); rcu_read_unlock(); } @@ -5747,13 +5913,13 @@ perf_event_aux(perf_event_aux_output_cb output, void *data, cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); if (cpuctx->unique_pmu != pmu) goto next; - perf_event_aux_ctx(&cpuctx->ctx, output, data); + perf_event_aux_ctx(&cpuctx->ctx, output, data, false); ctxn = pmu->task_ctx_nr; if (ctxn < 0) goto next; ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); if (ctx) - perf_event_aux_ctx(ctx, output, data); + perf_event_aux_ctx(ctx, output, data, false); next: put_cpu_ptr(pmu->pmu_cpu_context); } @@ -5761,6 +5927,134 @@ next: } /* + * Clear all file-based filters at exec, they'll have to be + * re-instated when/if these objects are mmapped again. 
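On the user-facing side, the forward/backward output split above pairs with the new write_backward attribute bit and the PERF_EVENT_IOC_PAUSE_OUTPUT ioctl from the uapi hunk. A hedged userspace sketch (the mmap setup and record parsing are elided): a backward-writing event keeps the newest records at the head of the ring buffer, and pausing output lets the reader take a consistent snapshot:

	#include <linux/perf_event.h>
	#include <sys/ioctl.h>

	/* fd: perf event opened with attr.write_backward = 1 */
	static void snapshot_latest_samples(int fd)
	{
		/* keep the kernel from overwriting records while we read */
		ioctl(fd, PERF_EVENT_IOC_PAUSE_OUTPUT, 1);

		/* ... walk records from the head of the mmap'ed data area ... */

		ioctl(fd, PERF_EVENT_IOC_PAUSE_OUTPUT, 0);
	}
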
+ */ +static void perf_event_addr_filters_exec(struct perf_event *event, void *data) +{ + struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); + struct perf_addr_filter *filter; + unsigned int restart = 0, count = 0; + unsigned long flags; + + if (!has_addr_filter(event)) + return; + + raw_spin_lock_irqsave(&ifh->lock, flags); + list_for_each_entry(filter, &ifh->list, entry) { + if (filter->inode) { + event->addr_filters_offs[count] = 0; + restart++; + } + + count++; + } + + if (restart) + event->addr_filters_gen++; + raw_spin_unlock_irqrestore(&ifh->lock, flags); + + if (restart) + perf_event_restart(event); +} + +void perf_event_exec(void) +{ + struct perf_event_context *ctx; + int ctxn; + + rcu_read_lock(); + for_each_task_context_nr(ctxn) { + ctx = current->perf_event_ctxp[ctxn]; + if (!ctx) + continue; + + perf_event_enable_on_exec(ctxn); + + perf_event_aux_ctx(ctx, perf_event_addr_filters_exec, NULL, + true); + } + rcu_read_unlock(); +} + +struct remote_output { + struct ring_buffer *rb; + int err; +}; + +static void __perf_event_output_stop(struct perf_event *event, void *data) +{ + struct perf_event *parent = event->parent; + struct remote_output *ro = data; + struct ring_buffer *rb = ro->rb; + struct stop_event_data sd = { + .event = event, + }; + + if (!has_aux(event)) + return; + + if (!parent) + parent = event; + + /* + * In case of inheritance, it will be the parent that links to the + * ring-buffer, but it will be the child that's actually using it: + */ + if (rcu_dereference(parent->rb) == rb) + ro->err = __perf_event_stop(&sd); +} + +static int __perf_pmu_output_stop(void *info) +{ + struct perf_event *event = info; + struct pmu *pmu = event->pmu; + struct perf_cpu_context *cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + struct remote_output ro = { + .rb = event->rb, + }; + + rcu_read_lock(); + perf_event_aux_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false); + if (cpuctx->task_ctx) + perf_event_aux_ctx(cpuctx->task_ctx, __perf_event_output_stop, + &ro, false); + rcu_read_unlock(); + + return ro.err; +} + +static void perf_pmu_output_stop(struct perf_event *event) +{ + struct perf_event *iter; + int err, cpu; + +restart: + rcu_read_lock(); + list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) { + /* + * For per-CPU events, we need to make sure that neither they + * nor their children are running; for cpu==-1 events it's + * sufficient to stop the event itself if it's active, since + * it can't have children. + */ + cpu = iter->cpu; + if (cpu == -1) + cpu = READ_ONCE(iter->oncpu); + + if (cpu == -1) + continue; + + err = cpu_function_call(cpu, __perf_pmu_output_stop, event); + if (err == -EAGAIN) { + rcu_read_unlock(); + goto restart; + } + } + rcu_read_unlock(); +} + +/* * task tracking -- fork/exit * * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task @@ -6169,6 +6463,87 @@ got_name: kfree(buf); } +/* + * Whether this @filter depends on a dynamic object which is not loaded + * yet or its load addresses are not known. + */ +static bool perf_addr_filter_needs_mmap(struct perf_addr_filter *filter) +{ + return filter->filter && filter->inode; +} + +/* + * Check whether inode and address range match filter criteria. 
+ */ +static bool perf_addr_filter_match(struct perf_addr_filter *filter, + struct file *file, unsigned long offset, + unsigned long size) +{ + if (filter->inode != file->f_inode) + return false; + + if (filter->offset > offset + size) + return false; + + if (filter->offset + filter->size < offset) + return false; + + return true; +} + +static void __perf_addr_filters_adjust(struct perf_event *event, void *data) +{ + struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); + struct vm_area_struct *vma = data; + unsigned long off = vma->vm_pgoff << PAGE_SHIFT, flags; + struct file *file = vma->vm_file; + struct perf_addr_filter *filter; + unsigned int restart = 0, count = 0; + + if (!has_addr_filter(event)) + return; + + if (!file) + return; + + raw_spin_lock_irqsave(&ifh->lock, flags); + list_for_each_entry(filter, &ifh->list, entry) { + if (perf_addr_filter_match(filter, file, off, + vma->vm_end - vma->vm_start)) { + event->addr_filters_offs[count] = vma->vm_start; + restart++; + } + + count++; + } + + if (restart) + event->addr_filters_gen++; + raw_spin_unlock_irqrestore(&ifh->lock, flags); + + if (restart) + perf_event_restart(event); +} + +/* + * Adjust all task's events' filters to the new vma + */ +static void perf_addr_filters_adjust(struct vm_area_struct *vma) +{ + struct perf_event_context *ctx; + int ctxn; + + rcu_read_lock(); + for_each_task_context_nr(ctxn) { + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (!ctx) + continue; + + perf_event_aux_ctx(ctx, __perf_addr_filters_adjust, vma, true); + } + rcu_read_unlock(); +} + void perf_event_mmap(struct vm_area_struct *vma) { struct perf_mmap_event mmap_event; @@ -6200,6 +6575,7 @@ void perf_event_mmap(struct vm_area_struct *vma) /* .flags (attr_mmap2 only) */ }; + perf_addr_filters_adjust(vma); perf_event_mmap_event(&mmap_event); } @@ -6491,10 +6867,7 @@ static int __perf_event_overflow(struct perf_event *event, irq_work_queue(&event->pending); } - if (event->overflow_handler) - event->overflow_handler(event, data, regs); - else - perf_event_output(event, data, regs); + event->overflow_handler(event, data, regs); if (*perf_event_fasync(event) && event->pending_kill) { event->pending_wakeup = 1; @@ -7081,24 +7454,6 @@ static inline void perf_tp_register(void) perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); } -static int perf_event_set_filter(struct perf_event *event, void __user *arg) -{ - char *filter_str; - int ret; - - if (event->attr.type != PERF_TYPE_TRACEPOINT) - return -EINVAL; - - filter_str = strndup_user(arg, PAGE_SIZE); - if (IS_ERR(filter_str)) - return PTR_ERR(filter_str); - - ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); - - kfree(filter_str); - return ret; -} - static void perf_event_free_filter(struct perf_event *event) { ftrace_profile_free_filter(event); @@ -7153,11 +7508,6 @@ static inline void perf_tp_register(void) { } -static int perf_event_set_filter(struct perf_event *event, void __user *arg) -{ - return -ENOENT; -} - static void perf_event_free_filter(struct perf_event *event) { } @@ -7186,6 +7536,387 @@ void perf_bp_event(struct perf_event *bp, void *data) #endif /* + * Allocate a new address filter + */ +static struct perf_addr_filter * +perf_addr_filter_new(struct perf_event *event, struct list_head *filters) +{ + int node = cpu_to_node(event->cpu == -1 ? 
0 : event->cpu); + struct perf_addr_filter *filter; + + filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node); + if (!filter) + return NULL; + + INIT_LIST_HEAD(&filter->entry); + list_add_tail(&filter->entry, filters); + + return filter; +} + +static void free_filters_list(struct list_head *filters) +{ + struct perf_addr_filter *filter, *iter; + + list_for_each_entry_safe(filter, iter, filters, entry) { + if (filter->inode) + iput(filter->inode); + list_del(&filter->entry); + kfree(filter); + } +} + +/* + * Free existing address filters and optionally install new ones + */ +static void perf_addr_filters_splice(struct perf_event *event, + struct list_head *head) +{ + unsigned long flags; + LIST_HEAD(list); + + if (!has_addr_filter(event)) + return; + + /* don't bother with children, they don't have their own filters */ + if (event->parent) + return; + + raw_spin_lock_irqsave(&event->addr_filters.lock, flags); + + list_splice_init(&event->addr_filters.list, &list); + if (head) + list_splice(head, &event->addr_filters.list); + + raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags); + + free_filters_list(&list); +} + +/* + * Scan through mm's vmas and see if one of them matches the + * @filter; if so, adjust filter's address range. + * Called with mm::mmap_sem down for reading. + */ +static unsigned long perf_addr_filter_apply(struct perf_addr_filter *filter, + struct mm_struct *mm) +{ + struct vm_area_struct *vma; + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + struct file *file = vma->vm_file; + unsigned long off = vma->vm_pgoff << PAGE_SHIFT; + unsigned long vma_size = vma->vm_end - vma->vm_start; + + if (!file) + continue; + + if (!perf_addr_filter_match(filter, file, off, vma_size)) + continue; + + return vma->vm_start; + } + + return 0; +} + +/* + * Update event's address range filters based on the + * task's existing mappings, if any. + */ +static void perf_event_addr_filters_apply(struct perf_event *event) +{ + struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); + struct task_struct *task = READ_ONCE(event->ctx->task); + struct perf_addr_filter *filter; + struct mm_struct *mm = NULL; + unsigned int count = 0; + unsigned long flags; + + /* + * We may observe TASK_TOMBSTONE, which means that the event tear-down + * will stop on the parent's child_mutex that our caller is also holding + */ + if (task == TASK_TOMBSTONE) + return; + + mm = get_task_mm(event->ctx->task); + if (!mm) + goto restart; + + down_read(&mm->mmap_sem); + + raw_spin_lock_irqsave(&ifh->lock, flags); + list_for_each_entry(filter, &ifh->list, entry) { + event->addr_filters_offs[count] = 0; + + if (perf_addr_filter_needs_mmap(filter)) + event->addr_filters_offs[count] = + perf_addr_filter_apply(filter, mm); + + count++; + } + + event->addr_filters_gen++; + raw_spin_unlock_irqrestore(&ifh->lock, flags); + + up_read(&mm->mmap_sem); + + mmput(mm); + +restart: + perf_event_restart(event); +} + +/* + * Address range filtering: limiting the data to certain + * instruction address ranges. Filters are ioctl()ed to us from + * userspace as ascii strings. 
+ * + * Filter string format: + * + * ACTION RANGE_SPEC + * where ACTION is one of the + * * "filter": limit the trace to this region + * * "start": start tracing from this address + * * "stop": stop tracing at this address/region; + * RANGE_SPEC is + * * for kernel addresses: <start address>[/<size>] + * * for object files: <start address>[/<size>]@</path/to/object/file> + * + * if <size> is not specified, the range is treated as a single address. + */ +enum { + IF_ACT_FILTER, + IF_ACT_START, + IF_ACT_STOP, + IF_SRC_FILE, + IF_SRC_KERNEL, + IF_SRC_FILEADDR, + IF_SRC_KERNELADDR, +}; + +enum { + IF_STATE_ACTION = 0, + IF_STATE_SOURCE, + IF_STATE_END, +}; + +static const match_table_t if_tokens = { + { IF_ACT_FILTER, "filter" }, + { IF_ACT_START, "start" }, + { IF_ACT_STOP, "stop" }, + { IF_SRC_FILE, "%u/%u@%s" }, + { IF_SRC_KERNEL, "%u/%u" }, + { IF_SRC_FILEADDR, "%u@%s" }, + { IF_SRC_KERNELADDR, "%u" }, +}; + +/* + * Address filter string parser + */ +static int +perf_event_parse_addr_filter(struct perf_event *event, char *fstr, + struct list_head *filters) +{ + struct perf_addr_filter *filter = NULL; + char *start, *orig, *filename = NULL; + struct path path; + substring_t args[MAX_OPT_ARGS]; + int state = IF_STATE_ACTION, token; + unsigned int kernel = 0; + int ret = -EINVAL; + + orig = fstr = kstrdup(fstr, GFP_KERNEL); + if (!fstr) + return -ENOMEM; + + while ((start = strsep(&fstr, " ,\n")) != NULL) { + ret = -EINVAL; + + if (!*start) + continue; + + /* filter definition begins */ + if (state == IF_STATE_ACTION) { + filter = perf_addr_filter_new(event, filters); + if (!filter) + goto fail; + } + + token = match_token(start, if_tokens, args); + switch (token) { + case IF_ACT_FILTER: + case IF_ACT_START: + filter->filter = 1; + + case IF_ACT_STOP: + if (state != IF_STATE_ACTION) + goto fail; + + state = IF_STATE_SOURCE; + break; + + case IF_SRC_KERNELADDR: + case IF_SRC_KERNEL: + kernel = 1; + + case IF_SRC_FILEADDR: + case IF_SRC_FILE: + if (state != IF_STATE_SOURCE) + goto fail; + + if (token == IF_SRC_FILE || token == IF_SRC_KERNEL) + filter->range = 1; + + *args[0].to = 0; + ret = kstrtoul(args[0].from, 0, &filter->offset); + if (ret) + goto fail; + + if (filter->range) { + *args[1].to = 0; + ret = kstrtoul(args[1].from, 0, &filter->size); + if (ret) + goto fail; + } + + if (token == IF_SRC_FILE) { + filename = match_strdup(&args[2]); + if (!filename) { + ret = -ENOMEM; + goto fail; + } + } + + state = IF_STATE_END; + break; + + default: + goto fail; + } + + /* + * Filter definition is fully parsed, validate and install it. + * Make sure that it doesn't contradict itself or the event's + * attribute. 
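For illustration, userspace passes such filter strings through the PERF_EVENT_IOC_SET_FILTER ioctl, which lands in perf_event_set_filter() below. A minimal, hypothetical usage sketch; the binary path, offsets and the kernel address are invented, and whether a given PMU honors "stop" filters is up to the driver:

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>

/* 'fd' is assumed to come from perf_event_open() for a per-task,
 * AUX-capable event (e.g. a processor-trace PMU). */
static int set_addr_filter(int fd)
{
	/* trace 0x2000 bytes of text at offset 0x1000 in /usr/bin/foo,
	 * and additionally stop tracing at a fixed kernel address */
	const char *fstr =
		"filter 0x1000/0x2000@/usr/bin/foo, stop 0xffffffff81000000";

	if (ioctl(fd, PERF_EVENT_IOC_SET_FILTER, fstr) < 0) {
		perror("PERF_EVENT_IOC_SET_FILTER");
		return -1;
	}
	return 0;
}

Multiple filters in one string are separated by commas or newlines, matching the strsep() separators used by the parser.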
+ */ + if (state == IF_STATE_END) { + if (kernel && event->attr.exclude_kernel) + goto fail; + + if (!kernel) { + if (!filename) + goto fail; + + /* look up the path and grab its inode */ + ret = kern_path(filename, LOOKUP_FOLLOW, &path); + if (ret) + goto fail_free_name; + + filter->inode = igrab(d_inode(path.dentry)); + path_put(&path); + kfree(filename); + filename = NULL; + + ret = -EINVAL; + if (!filter->inode || + !S_ISREG(filter->inode->i_mode)) + /* free_filters_list() will iput() */ + goto fail; + } + + /* ready to consume more filters */ + state = IF_STATE_ACTION; + filter = NULL; + } + } + + if (state != IF_STATE_ACTION) + goto fail; + + kfree(orig); + + return 0; + +fail_free_name: + kfree(filename); +fail: + free_filters_list(filters); + kfree(orig); + + return ret; +} + +static int +perf_event_set_addr_filter(struct perf_event *event, char *filter_str) +{ + LIST_HEAD(filters); + int ret; + + /* + * Since this is called in perf_ioctl() path, we're already holding + * ctx::mutex. + */ + lockdep_assert_held(&event->ctx->mutex); + + if (WARN_ON_ONCE(event->parent)) + return -EINVAL; + + /* + * For now, we only support filtering in per-task events; doing so + * for CPU-wide events requires additional context switching trickery, + * since same object code will be mapped at different virtual + * addresses in different processes. + */ + if (!event->ctx->task) + return -EOPNOTSUPP; + + ret = perf_event_parse_addr_filter(event, filter_str, &filters); + if (ret) + return ret; + + ret = event->pmu->addr_filters_validate(&filters); + if (ret) { + free_filters_list(&filters); + return ret; + } + + /* remove existing filters, if any */ + perf_addr_filters_splice(event, &filters); + + /* install new filters */ + perf_event_for_each_child(event, perf_event_addr_filters_apply); + + return ret; +} + +static int perf_event_set_filter(struct perf_event *event, void __user *arg) +{ + char *filter_str; + int ret = -EINVAL; + + if ((event->attr.type != PERF_TYPE_TRACEPOINT || + !IS_ENABLED(CONFIG_EVENT_TRACING)) && + !has_addr_filter(event)) + return -EINVAL; + + filter_str = strndup_user(arg, PAGE_SIZE); + if (IS_ERR(filter_str)) + return PTR_ERR(filter_str); + + if (IS_ENABLED(CONFIG_EVENT_TRACING) && + event->attr.type == PERF_TYPE_TRACEPOINT) + ret = ftrace_profile_set_filter(event, event->attr.config, + filter_str); + else if (has_addr_filter(event)) + ret = perf_event_set_addr_filter(event, filter_str); + + kfree(filter_str); + return ret; +} + +/* * hrtimer based swevent callback */ @@ -7542,6 +8273,20 @@ static void free_pmu_context(struct pmu *pmu) out: mutex_unlock(&pmus_lock); } + +/* + * Let userspace know that this PMU supports address range filtering: + */ +static ssize_t nr_addr_filters_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct pmu *pmu = dev_get_drvdata(dev); + + return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters); +} +DEVICE_ATTR_RO(nr_addr_filters); + static struct idr pmu_idr; static ssize_t @@ -7643,9 +8388,19 @@ static int pmu_dev_alloc(struct pmu *pmu) if (ret) goto free_dev; + /* For PMUs with address filters, throw in an extra attribute: */ + if (pmu->nr_addr_filters) + ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters); + + if (ret) + goto del_dev; + out: return ret; +del_dev: + device_del(pmu->dev); + free_dev: put_device(pmu->dev); goto out; @@ -7685,6 +8440,21 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type) } skip_type: + if (pmu->task_ctx_nr == perf_hw_context) { + static int 
hw_context_taken = 0; + + /* + * Other than systems with heterogeneous CPUs, it never makes + * sense for two PMUs to share perf_hw_context. PMUs which are + * uncore must use perf_invalid_context. + */ + if (WARN_ON_ONCE(hw_context_taken && + !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS))) + pmu->task_ctx_nr = perf_invalid_context; + + hw_context_taken = 1; + } + pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); if (pmu->pmu_cpu_context) goto got_cpu_context; @@ -7772,6 +8542,8 @@ void perf_pmu_unregister(struct pmu *pmu) free_percpu(pmu->pmu_disable_count); if (pmu->type >= PERF_TYPE_MAX) idr_remove(&pmu_idr, pmu->type); + if (pmu->nr_addr_filters) + device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); device_del(pmu->dev); put_device(pmu->dev); free_pmu_context(pmu); @@ -7965,6 +8737,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, INIT_LIST_HEAD(&event->sibling_list); INIT_LIST_HEAD(&event->rb_entry); INIT_LIST_HEAD(&event->active_entry); + INIT_LIST_HEAD(&event->addr_filters.list); INIT_HLIST_NODE(&event->hlist_entry); @@ -7972,6 +8745,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, init_irq_work(&event->pending, perf_pending_event); mutex_init(&event->mmap_mutex); + raw_spin_lock_init(&event->addr_filters.lock); atomic_long_set(&event->refcount, 1); event->cpu = cpu; @@ -8006,8 +8780,16 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, context = parent_event->overflow_handler_context; } - event->overflow_handler = overflow_handler; - event->overflow_handler_context = context; + if (overflow_handler) { + event->overflow_handler = overflow_handler; + event->overflow_handler_context = context; + } else if (is_write_backward(event)){ + event->overflow_handler = perf_event_output_backward; + event->overflow_handler_context = NULL; + } else { + event->overflow_handler = perf_event_output_forward; + event->overflow_handler_context = NULL; + } perf_event__state_init(event); @@ -8048,11 +8830,22 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (err) goto err_pmu; + if (has_addr_filter(event)) { + event->addr_filters_offs = kcalloc(pmu->nr_addr_filters, + sizeof(unsigned long), + GFP_KERNEL); + if (!event->addr_filters_offs) + goto err_per_task; + + /* force hw sync on the address filters */ + event->addr_filters_gen = 1; + } + if (!event->parent) { if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { err = get_callchain_buffers(); if (err) - goto err_per_task; + goto err_addr_filters; } } @@ -8061,6 +8854,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, return event; +err_addr_filters: + kfree(event->addr_filters_offs); + err_per_task: exclusive_event_destroy(event); @@ -8240,6 +9036,13 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) goto out; /* + * Either writing ring buffer from beginning or from end. + * Mixing is not allowed. 
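The choice between the forward and backward output paths above keys off is_write_backward(), which presumably tests the write_backward attribute bit added elsewhere in this series. A hedged userspace sketch of opening a backward-writing event (the software dummy counter is just an arbitrary placeholder):

#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static int open_backward_event(pid_t pid)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_DUMMY;
	attr.sample_type = PERF_SAMPLE_IP;
	attr.sample_period = 1;
	attr.write_backward = 1;	/* ring buffer fills from the end */

	/* monitor 'pid' on any CPU; no group leader, no flags */
	return syscall(__NR_perf_event_open, &attr, pid, -1, -1, 0);
}

If such an event is later redirected with PERF_EVENT_IOC_SET_OUTPUT, the target event must use the same write direction, which is exactly what the check that follows enforces.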
+ */ + if (is_write_backward(output_event) != is_write_backward(event)) + goto out; + + /* * If both events generate aux data, they must be on the same PMU */ if (has_aux(event) && has_aux(output_event) && diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 4199b6d193f5..05f9f6d626df 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -11,13 +11,13 @@ struct ring_buffer { atomic_t refcount; struct rcu_head rcu_head; - struct irq_work irq_work; #ifdef CONFIG_PERF_USE_VMALLOC struct work_struct work; int page_order; /* allocation order */ #endif int nr_pages; /* nr of data pages */ int overwrite; /* can overwrite itself */ + int paused; /* can write into ring buffer */ atomic_t poll; /* POLL_ for wakeups */ @@ -65,6 +65,14 @@ static inline void rb_free_rcu(struct rcu_head *rcu_head) rb_free(rb); } +static inline void rb_toggle_paused(struct ring_buffer *rb, bool pause) +{ + if (!pause && rb->nr_pages) + rb->paused = 0; + else + rb->paused = 1; +} + extern struct ring_buffer * rb_alloc(int nr_pages, long watermark, int cpu, int flags); extern void perf_event_wakeup(struct perf_event *event); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index c61f0cbd308b..ae9b90dc9a5a 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -102,8 +102,21 @@ out: preempt_enable(); } -int perf_output_begin(struct perf_output_handle *handle, - struct perf_event *event, unsigned int size) +static bool __always_inline +ring_buffer_has_space(unsigned long head, unsigned long tail, + unsigned long data_size, unsigned int size, + bool backward) +{ + if (!backward) + return CIRC_SPACE(head, tail, data_size) >= size; + else + return CIRC_SPACE(tail, head, data_size) >= size; +} + +static int __always_inline +__perf_output_begin(struct perf_output_handle *handle, + struct perf_event *event, unsigned int size, + bool backward) { struct ring_buffer *rb; unsigned long tail, offset, head; @@ -125,8 +138,11 @@ int perf_output_begin(struct perf_output_handle *handle, if (unlikely(!rb)) goto out; - if (unlikely(!rb->nr_pages)) + if (unlikely(rb->paused)) { + if (rb->nr_pages) + local_inc(&rb->lost); goto out; + } handle->rb = rb; handle->event = event; @@ -143,9 +159,12 @@ int perf_output_begin(struct perf_output_handle *handle, do { tail = READ_ONCE(rb->user_page->data_tail); offset = head = local_read(&rb->head); - if (!rb->overwrite && - unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size)) - goto fail; + if (!rb->overwrite) { + if (unlikely(!ring_buffer_has_space(head, tail, + perf_data_size(rb), + size, backward))) + goto fail; + } /* * The above forms a control dependency barrier separating the @@ -159,9 +178,17 @@ int perf_output_begin(struct perf_output_handle *handle, * See perf_output_put_handle(). */ - head += size; + if (!backward) + head += size; + else + head -= size; } while (local_cmpxchg(&rb->head, offset, head) != offset); + if (backward) { + offset = head; + head = (u64)(-head); + } + /* * We rely on the implied barrier() by local_cmpxchg() to ensure * none of the data stores below can be lifted up by the compiler. 
@@ -203,6 +230,26 @@ out: return -ENOSPC; } +int perf_output_begin_forward(struct perf_output_handle *handle, + struct perf_event *event, unsigned int size) +{ + return __perf_output_begin(handle, event, size, false); +} + +int perf_output_begin_backward(struct perf_output_handle *handle, + struct perf_event *event, unsigned int size) +{ + return __perf_output_begin(handle, event, size, true); +} + +int perf_output_begin(struct perf_output_handle *handle, + struct perf_event *event, unsigned int size) +{ + + return __perf_output_begin(handle, event, size, + unlikely(is_write_backward(event))); +} + unsigned int perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len) { @@ -221,8 +268,6 @@ void perf_output_end(struct perf_output_handle *handle) rcu_read_unlock(); } -static void rb_irq_work(struct irq_work *work); - static void ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) { @@ -243,16 +288,13 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) INIT_LIST_HEAD(&rb->event_list); spin_lock_init(&rb->event_lock); - init_irq_work(&rb->irq_work, rb_irq_work); -} -static void ring_buffer_put_async(struct ring_buffer *rb) -{ - if (!atomic_dec_and_test(&rb->refcount)) - return; - - rb->rcu_head.next = (void *)rb; - irq_work_queue(&rb->irq_work); + /* + * perf_output_begin() only checks rb->paused, therefore + * rb->paused must be true if we have no pages for output. + */ + if (!rb->nr_pages) + rb->paused = 1; } /* @@ -264,6 +306,10 @@ static void ring_buffer_put_async(struct ring_buffer *rb) * The ordering is similar to that of perf_output_{begin,end}, with * the exception of (B), which should be taken care of by the pmu * driver, since ordering rules will differ depending on hardware. + * + * Call this from pmu::start(); see the comment in perf_aux_output_end() + * about its use in pmu callbacks. Both can also be called from the PMI + * handler if needed. */ void *perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *event) @@ -288,6 +334,13 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, goto err; /* + * If rb::aux_mmap_count is zero (and rb_has_aux() above went through), + * the aux buffer is in perf_mmap_close(), about to get freed. + */ + if (!atomic_read(&rb->aux_mmap_count)) + goto err_put; + + /* * Nesting is not supported for AUX area, make sure nested * writers are caught early */ @@ -328,10 +381,11 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, return handle->rb->aux_priv; err_put: + /* can't be last */ rb_free_aux(rb); err: - ring_buffer_put_async(rb); + ring_buffer_put(rb); handle->event = NULL; return NULL; @@ -342,11 +396,16 @@ err: * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the * pmu driver's responsibility to observe ordering rules of the hardware, * so that all the data is externally visible before this is called. + * + * Note: this has to be called from pmu::stop() callback, as the assumption + * of the AUX buffer management code is that after pmu::stop(), the AUX + * transaction must be stopped and therefore drop the AUX reference count. 
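To make the calling convention described in these comments concrete, here is a minimal, hypothetical PMU-driver sketch; the mypmu_* names are invented and a real driver would also program and drain its hardware:

#include <linux/perf_event.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(struct perf_output_handle, mypmu_handle);

static void mypmu_start(struct perf_event *event, int flags)
{
	struct perf_output_handle *handle = this_cpu_ptr(&mypmu_handle);
	void *buf;

	/* open the AUX transaction; this takes the aux_refcount */
	buf = perf_aux_output_begin(handle, event);
	if (!buf)
		return;

	/* ... point the hardware at 'buf' and start it ... */
}

static void mypmu_stop(struct perf_event *event, int flags)
{
	struct perf_output_handle *handle = this_cpu_ptr(&mypmu_handle);
	unsigned long size = 0;	/* bytes the hardware actually wrote */

	/* ... stop the hardware first ... */

	/* close the transaction; drops the reference taken in start() */
	perf_aux_output_end(handle, size, false);
}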
*/ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, bool truncated) { struct ring_buffer *rb = handle->rb; + bool wakeup = truncated; unsigned long aux_head; u64 flags = 0; @@ -375,14 +434,22 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, aux_head = rb->user_page->aux_head = local_read(&rb->aux_head); if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) { - perf_output_wakeup(handle); + wakeup = true; local_add(rb->aux_watermark, &rb->aux_wakeup); } + + if (wakeup) { + if (truncated) + handle->event->pending_disable = 1; + perf_output_wakeup(handle); + } + handle->event = NULL; local_set(&rb->aux_nest, 0); + /* can't be last */ rb_free_aux(rb); - ring_buffer_put_async(rb); + ring_buffer_put(rb); } /* @@ -463,6 +530,14 @@ static void __rb_free_aux(struct ring_buffer *rb) { int pg; + /* + * Should never happen, the last reference should be dropped from + * perf_mmap_close() path, which first stops aux transactions (which + * in turn are the atomic holders of aux_refcount) and then does the + * last rb_free_aux(). + */ + WARN_ON_ONCE(in_atomic()); + if (rb->aux_priv) { rb->free_aux(rb->aux_priv); rb->free_aux = NULL; @@ -574,18 +649,7 @@ out: void rb_free_aux(struct ring_buffer *rb) { if (atomic_dec_and_test(&rb->aux_refcount)) - irq_work_queue(&rb->irq_work); -} - -static void rb_irq_work(struct irq_work *work) -{ - struct ring_buffer *rb = container_of(work, struct ring_buffer, irq_work); - - if (!atomic_read(&rb->aux_refcount)) __rb_free_aux(rb); - - if (rb->rcu_head.next == (void *)rb) - call_rcu(&rb->rcu_head, rb_free_rcu); } #ifndef CONFIG_PERF_USE_VMALLOC diff --git a/kernel/fork.c b/kernel/fork.c index d277e83ed3e0..3e8451527cbe 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1494,7 +1494,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, * sigaltstack should be cleared when sharing the same VM */ if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM) - p->sas_ss_sp = p->sas_ss_size = 0; + sas_ss_reset(p); /* * Syscall tracing and stepping should be turned off in the diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 78c1c0ee6dc1..81f1a7107c0e 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -45,6 +45,7 @@ #include <linux/bitops.h> #include <linux/gfp.h> #include <linux/kmemcheck.h> +#include <linux/random.h> #include <asm/sections.h> @@ -708,7 +709,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) * yet. Otherwise we look it up. We cache the result in the lock object * itself, so actual lookup of the hash should be once per lock object. */ -static inline struct lock_class * +static struct lock_class * register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) { struct lockdep_subclass_key *key; @@ -3585,7 +3586,35 @@ static int __lock_is_held(struct lockdep_map *lock) return 0; } -static void __lock_pin_lock(struct lockdep_map *lock) +static struct pin_cookie __lock_pin_lock(struct lockdep_map *lock) +{ + struct pin_cookie cookie = NIL_COOKIE; + struct task_struct *curr = current; + int i; + + if (unlikely(!debug_locks)) + return cookie; + + for (i = 0; i < curr->lockdep_depth; i++) { + struct held_lock *hlock = curr->held_locks + i; + + if (match_held_lock(hlock, lock)) { + /* + * Grab 16bits of randomness; this is sufficient to not + * be guessable and still allows some pin nesting in + * our u32 pin_count. 
+ */ + cookie.val = 1 + (prandom_u32() >> 16); + hlock->pin_count += cookie.val; + return cookie; + } + } + + WARN(1, "pinning an unheld lock\n"); + return cookie; +} + +static void __lock_repin_lock(struct lockdep_map *lock, struct pin_cookie cookie) { struct task_struct *curr = current; int i; @@ -3597,7 +3626,7 @@ static void __lock_pin_lock(struct lockdep_map *lock) struct held_lock *hlock = curr->held_locks + i; if (match_held_lock(hlock, lock)) { - hlock->pin_count++; + hlock->pin_count += cookie.val; return; } } @@ -3605,7 +3634,7 @@ static void __lock_pin_lock(struct lockdep_map *lock) WARN(1, "pinning an unheld lock\n"); } -static void __lock_unpin_lock(struct lockdep_map *lock) +static void __lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie) { struct task_struct *curr = current; int i; @@ -3620,7 +3649,11 @@ static void __lock_unpin_lock(struct lockdep_map *lock) if (WARN(!hlock->pin_count, "unpinning an unpinned lock\n")) return; - hlock->pin_count--; + hlock->pin_count -= cookie.val; + + if (WARN((int)hlock->pin_count < 0, "pin count corrupted\n")) + hlock->pin_count = 0; + return; } } @@ -3751,24 +3784,44 @@ int lock_is_held(struct lockdep_map *lock) } EXPORT_SYMBOL_GPL(lock_is_held); -void lock_pin_lock(struct lockdep_map *lock) +struct pin_cookie lock_pin_lock(struct lockdep_map *lock) { + struct pin_cookie cookie = NIL_COOKIE; unsigned long flags; if (unlikely(current->lockdep_recursion)) - return; + return cookie; raw_local_irq_save(flags); check_flags(flags); current->lockdep_recursion = 1; - __lock_pin_lock(lock); + cookie = __lock_pin_lock(lock); current->lockdep_recursion = 0; raw_local_irq_restore(flags); + + return cookie; } EXPORT_SYMBOL_GPL(lock_pin_lock); -void lock_unpin_lock(struct lockdep_map *lock) +void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie cookie) +{ + unsigned long flags; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + check_flags(flags); + + current->lockdep_recursion = 1; + __lock_repin_lock(lock, cookie); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_repin_lock); + +void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie) { unsigned long flags; @@ -3779,7 +3832,7 @@ void lock_unpin_lock(struct lockdep_map *lock) check_flags(flags); current->lockdep_recursion = 1; - __lock_unpin_lock(lock); + __lock_unpin_lock(lock, cookie); current->lockdep_recursion = 0; raw_local_irq_restore(flags); } diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 8ef1919d63b2..f8c5af52a131 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -75,12 +75,7 @@ struct lock_stress_stats { long n_lock_acquired; }; -#if defined(MODULE) -#define LOCKTORTURE_RUNNABLE_INIT 1 -#else -#define LOCKTORTURE_RUNNABLE_INIT 0 -#endif -int torture_runnable = LOCKTORTURE_RUNNABLE_INIT; +int torture_runnable = IS_ENABLED(MODULE); module_param(torture_runnable, int, 0444); MODULE_PARM_DESC(torture_runnable, "Start locktorture at module init"); @@ -394,12 +389,12 @@ static void torture_rtmutex_boost(struct torture_random_state *trsp) if (!rt_task(current)) { /* - * (1) Boost priority once every ~50k operations. When the + * Boost priority once every ~50k operations. When the * task tries to take the lock, the rtmutex it will account * for the new priority, and do any corresponding pi-dance. 
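A brief caller-side sketch of the cookie-based pinning API introduced above; it assumes the lockdep_pin_lock()/lockdep_unpin_lock() wrapper macros are converted to the cookie form elsewhere in this series, and do_pinned_work() is purely hypothetical:

#include <linux/lockdep.h>
#include <linux/spinlock.h>

/*
 * Pin a held lock so that code called underneath cannot drop and retake
 * it unnoticed; the same (randomized) cookie must be presented to unpin.
 */
static void do_pinned_work(raw_spinlock_t *lock)
{
	struct pin_cookie cookie;

	raw_spin_lock(lock);
	cookie = lockdep_pin_lock(lock);

	/* ... work that relies on 'lock' staying held throughout ... */

	lockdep_unpin_lock(lock, cookie);
	raw_spin_unlock(lock);
}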
*/ - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * factor))) { + if (trsp && !(torture_random(trsp) % + (cxt.nrealwriters_stress * factor))) { policy = SCHED_FIFO; param.sched_priority = MAX_RT_PRIO - 1; } else /* common case, do nothing */ @@ -748,6 +743,15 @@ static void lock_torture_cleanup(void) if (torture_cleanup_begin()) return; + /* + * Indicates early cleanup, meaning that the test has not run, + * such as when passing bogus args when loading the module. As + * such, only perform the underlying torture-specific cleanups, + * and avoid anything related to locktorture. + */ + if (!cxt.lwsa) + goto end; + if (writer_tasks) { for (i = 0; i < cxt.nrealwriters_stress; i++) torture_stop_kthread(lock_torture_writer, @@ -776,6 +780,7 @@ static void lock_torture_cleanup(void) else lock_torture_print_module_parms(cxt.cur_ops, "End of test: SUCCESS"); +end: torture_cleanup_end(); } @@ -870,6 +875,7 @@ static int __init lock_torture_init(void) VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory"); firsterr = -ENOMEM; kfree(cxt.lwsa); + cxt.lwsa = NULL; goto unwind; } @@ -878,6 +884,7 @@ static int __init lock_torture_init(void) cxt.lrsa[i].n_lock_acquired = 0; } } + lock_torture_print_module_parms(cxt.cur_ops, "Start of test"); /* Prepare torture context. */ diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index d734b7502001..22e025309845 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -191,8 +191,6 @@ static ssize_t qstat_write(struct file *file, const char __user *user_buf, for (i = 0 ; i < qstat_num; i++) WRITE_ONCE(ptr[i], 0); - for (i = 0 ; i < qstat_num; i++) - WRITE_ONCE(ptr[i], 0); } return count; } @@ -214,10 +212,8 @@ static int __init init_qspinlock_stat(void) struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL); int i; - if (!d_qstat) { - pr_warn("Could not create 'qlockstat' debugfs directory\n"); - return 0; - } + if (!d_qstat) + goto out; /* * Create the debugfs files @@ -227,12 +223,20 @@ static int __init init_qspinlock_stat(void) * performance. 
*/ for (i = 0; i < qstat_num; i++) - debugfs_create_file(qstat_names[i], 0400, d_qstat, - (void *)(long)i, &fops_qstat); + if (!debugfs_create_file(qstat_names[i], 0400, d_qstat, + (void *)(long)i, &fops_qstat)) + goto fail_undo; + + if (!debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat, + (void *)(long)qstat_reset_cnts, &fops_qstat)) + goto fail_undo; - debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat, - (void *)(long)qstat_reset_cnts, &fops_qstat); return 0; +fail_undo: + debugfs_remove_recursive(d_qstat); +out: + pr_warn("Could not create 'qlockstat' debugfs entries\n"); + return -ENOMEM; } fs_initcall(init_qspinlock_stat); diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 3a5048572065..1591f6b3539f 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -191,11 +191,12 @@ int __down_read_trylock(struct rw_semaphore *sem) /* * get a write lock on the semaphore */ -void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) +int __sched __down_write_common(struct rw_semaphore *sem, int state) { struct rwsem_waiter waiter; struct task_struct *tsk; unsigned long flags; + int ret = 0; raw_spin_lock_irqsave(&sem->wait_lock, flags); @@ -215,21 +216,33 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) */ if (sem->count == 0) break; - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (signal_pending_state(state, current)) { + ret = -EINTR; + goto out; + } + set_task_state(tsk, state); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); schedule(); raw_spin_lock_irqsave(&sem->wait_lock, flags); } /* got the lock */ sem->count = -1; +out: list_del(&waiter.list); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + + return ret; } void __sched __down_write(struct rw_semaphore *sem) { - __down_write_nested(sem, 0); + __down_write_common(sem, TASK_UNINTERRUPTIBLE); +} + +int __sched __down_write_killable(struct rw_semaphore *sem) +{ + return __down_write_common(sem, TASK_KILLABLE); } /* diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index a4d4de05b2d1..09e30c6225e5 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -433,12 +433,13 @@ static inline bool rwsem_has_spinner(struct rw_semaphore *sem) /* * Wait until we successfully acquire the write lock */ -__visible -struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) +static inline struct rw_semaphore * +__rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) { long count; bool waiting = true; /* any queued threads before us */ struct rwsem_waiter waiter; + struct rw_semaphore *ret = sem; /* undo write bias from down_write operation, stop active locking */ count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem); @@ -478,7 +479,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); /* wait until we successfully acquire the lock */ - set_current_state(TASK_UNINTERRUPTIBLE); + set_current_state(state); while (true) { if (rwsem_try_write_lock(count, sem)) break; @@ -486,21 +487,48 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) /* Block until there are no active lockers. 
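The killable write-lock slow paths added in these rwsem hunks are consumed through down_write_killable(), added further down in this patch; a small caller-side sketch, with modify_protected_data() being hypothetical:

#include <linux/errno.h>
#include <linux/rwsem.h>

/* A fatal signal (e.g. an OOM kill) aborts the wait instead of leaving
 * the task blocked in uninterruptible sleep. */
static int modify_protected_data(struct rw_semaphore *sem)
{
	if (down_write_killable(sem))
		return -EINTR;

	/* ... update the data protected by 'sem' ... */

	up_write(sem);
	return 0;
}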
*/ do { + if (signal_pending_state(state, current)) + goto out_nolock; + schedule(); - set_current_state(TASK_UNINTERRUPTIBLE); + set_current_state(state); } while ((count = sem->count) & RWSEM_ACTIVE_MASK); raw_spin_lock_irq(&sem->wait_lock); } __set_current_state(TASK_RUNNING); + list_del(&waiter.list); + raw_spin_unlock_irq(&sem->wait_lock); + return ret; + +out_nolock: + __set_current_state(TASK_RUNNING); + raw_spin_lock_irq(&sem->wait_lock); list_del(&waiter.list); + if (list_empty(&sem->wait_list)) + rwsem_atomic_update(-RWSEM_WAITING_BIAS, sem); + else + __rwsem_do_wake(sem, RWSEM_WAKE_ANY); raw_spin_unlock_irq(&sem->wait_lock); - return sem; + return ERR_PTR(-EINTR); +} + +__visible struct rw_semaphore * __sched +rwsem_down_write_failed(struct rw_semaphore *sem) +{ + return __rwsem_down_write_failed_common(sem, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(rwsem_down_write_failed); +__visible struct rw_semaphore * __sched +rwsem_down_write_failed_killable(struct rw_semaphore *sem) +{ + return __rwsem_down_write_failed_common(sem, TASK_KILLABLE); +} +EXPORT_SYMBOL(rwsem_down_write_failed_killable); + /* * handle waking up a waiter on the semaphore * - up_read/up_write has decremented the active part of count if we come here diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 205be0ce34de..c817216c1615 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -55,6 +55,25 @@ void __sched down_write(struct rw_semaphore *sem) EXPORT_SYMBOL(down_write); /* + * lock for writing + */ +int __sched down_write_killable(struct rw_semaphore *sem) +{ + might_sleep(); + rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); + + if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) { + rwsem_release(&sem->dep_map, 1, _RET_IP_); + return -EINTR; + } + + rwsem_set_owner(sem); + return 0; +} + +EXPORT_SYMBOL(down_write_killable); + +/* * trylock for writing -- returns 1 if successful, 0 if contention */ int down_write_trylock(struct rw_semaphore *sem) diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 032b2c015beb..18dfc485225c 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -5,6 +5,7 @@ KCOV_INSTRUMENT := n obj-y += update.o sync.o obj-$(CONFIG_SRCU) += srcu.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o +obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_PREEMPT_RCU) += tree.o obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c new file mode 100644 index 000000000000..3cee0d8393ed --- /dev/null +++ b/kernel/rcu/rcuperf.c @@ -0,0 +1,655 @@ +/* + * Read-Copy Update module-based performance-test facility + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2015 + * + * Authors: Paul E. 
McKenney <paulmck@us.ibm.com> + */ +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/err.h> +#include <linux/spinlock.h> +#include <linux/smp.h> +#include <linux/rcupdate.h> +#include <linux/interrupt.h> +#include <linux/sched.h> +#include <linux/atomic.h> +#include <linux/bitops.h> +#include <linux/completion.h> +#include <linux/moduleparam.h> +#include <linux/percpu.h> +#include <linux/notifier.h> +#include <linux/reboot.h> +#include <linux/freezer.h> +#include <linux/cpu.h> +#include <linux/delay.h> +#include <linux/stat.h> +#include <linux/srcu.h> +#include <linux/slab.h> +#include <asm/byteorder.h> +#include <linux/torture.h> +#include <linux/vmalloc.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); + +#define PERF_FLAG "-perf:" +#define PERFOUT_STRING(s) \ + pr_alert("%s" PERF_FLAG s "\n", perf_type) +#define VERBOSE_PERFOUT_STRING(s) \ + do { if (verbose) pr_alert("%s" PERF_FLAG " %s\n", perf_type, s); } while (0) +#define VERBOSE_PERFOUT_ERRSTRING(s) \ + do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) + +torture_param(bool, gp_exp, true, "Use expedited GP wait primitives"); +torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); +torture_param(int, nreaders, -1, "Number of RCU reader threads"); +torture_param(int, nwriters, -1, "Number of RCU updater threads"); +torture_param(bool, shutdown, false, "Shutdown at end of performance tests."); +torture_param(bool, verbose, true, "Enable verbose debugging printk()s"); + +static char *perf_type = "rcu"; +module_param(perf_type, charp, 0444); +MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, rcu_bh, ...)"); + +static int nrealreaders; +static int nrealwriters; +static struct task_struct **writer_tasks; +static struct task_struct **reader_tasks; +static struct task_struct *shutdown_task; + +static u64 **writer_durations; +static int *writer_n_durations; +static atomic_t n_rcu_perf_reader_started; +static atomic_t n_rcu_perf_writer_started; +static atomic_t n_rcu_perf_writer_finished; +static wait_queue_head_t shutdown_wq; +static u64 t_rcu_perf_writer_started; +static u64 t_rcu_perf_writer_finished; +static unsigned long b_rcu_perf_writer_started; +static unsigned long b_rcu_perf_writer_finished; + +static int rcu_perf_writer_state; +#define RTWS_INIT 0 +#define RTWS_EXP_SYNC 1 +#define RTWS_SYNC 2 +#define RTWS_IDLE 2 +#define RTWS_STOPPING 3 + +#define MAX_MEAS 10000 +#define MIN_MEAS 100 + +#if defined(MODULE) || defined(CONFIG_RCU_PERF_TEST_RUNNABLE) +#define RCUPERF_RUNNABLE_INIT 1 +#else +#define RCUPERF_RUNNABLE_INIT 0 +#endif +static int perf_runnable = RCUPERF_RUNNABLE_INIT; +module_param(perf_runnable, int, 0444); +MODULE_PARM_DESC(perf_runnable, "Start rcuperf at boot"); + +/* + * Operations vector for selecting different types of tests. + */ + +struct rcu_perf_ops { + int ptype; + void (*init)(void); + void (*cleanup)(void); + int (*readlock)(void); + void (*readunlock)(int idx); + unsigned long (*started)(void); + unsigned long (*completed)(void); + unsigned long (*exp_completed)(void); + void (*sync)(void); + void (*exp_sync)(void); + const char *name; +}; + +static struct rcu_perf_ops *cur_ops; + +/* + * Definitions for rcu perf testing. 
+ */ + +static int rcu_perf_read_lock(void) __acquires(RCU) +{ + rcu_read_lock(); + return 0; +} + +static void rcu_perf_read_unlock(int idx) __releases(RCU) +{ + rcu_read_unlock(); +} + +static unsigned long __maybe_unused rcu_no_completed(void) +{ + return 0; +} + +static void rcu_sync_perf_init(void) +{ +} + +static struct rcu_perf_ops rcu_ops = { + .ptype = RCU_FLAVOR, + .init = rcu_sync_perf_init, + .readlock = rcu_perf_read_lock, + .readunlock = rcu_perf_read_unlock, + .started = rcu_batches_started, + .completed = rcu_batches_completed, + .exp_completed = rcu_exp_batches_completed, + .sync = synchronize_rcu, + .exp_sync = synchronize_rcu_expedited, + .name = "rcu" +}; + +/* + * Definitions for rcu_bh perf testing. + */ + +static int rcu_bh_perf_read_lock(void) __acquires(RCU_BH) +{ + rcu_read_lock_bh(); + return 0; +} + +static void rcu_bh_perf_read_unlock(int idx) __releases(RCU_BH) +{ + rcu_read_unlock_bh(); +} + +static struct rcu_perf_ops rcu_bh_ops = { + .ptype = RCU_BH_FLAVOR, + .init = rcu_sync_perf_init, + .readlock = rcu_bh_perf_read_lock, + .readunlock = rcu_bh_perf_read_unlock, + .started = rcu_batches_started_bh, + .completed = rcu_batches_completed_bh, + .exp_completed = rcu_exp_batches_completed_sched, + .sync = synchronize_rcu_bh, + .exp_sync = synchronize_rcu_bh_expedited, + .name = "rcu_bh" +}; + +/* + * Definitions for srcu perf testing. + */ + +DEFINE_STATIC_SRCU(srcu_ctl_perf); +static struct srcu_struct *srcu_ctlp = &srcu_ctl_perf; + +static int srcu_perf_read_lock(void) __acquires(srcu_ctlp) +{ + return srcu_read_lock(srcu_ctlp); +} + +static void srcu_perf_read_unlock(int idx) __releases(srcu_ctlp) +{ + srcu_read_unlock(srcu_ctlp, idx); +} + +static unsigned long srcu_perf_completed(void) +{ + return srcu_batches_completed(srcu_ctlp); +} + +static void srcu_perf_synchronize(void) +{ + synchronize_srcu(srcu_ctlp); +} + +static void srcu_perf_synchronize_expedited(void) +{ + synchronize_srcu_expedited(srcu_ctlp); +} + +static struct rcu_perf_ops srcu_ops = { + .ptype = SRCU_FLAVOR, + .init = rcu_sync_perf_init, + .readlock = srcu_perf_read_lock, + .readunlock = srcu_perf_read_unlock, + .started = NULL, + .completed = srcu_perf_completed, + .exp_completed = srcu_perf_completed, + .sync = srcu_perf_synchronize, + .exp_sync = srcu_perf_synchronize_expedited, + .name = "srcu" +}; + +/* + * Definitions for sched perf testing. + */ + +static int sched_perf_read_lock(void) +{ + preempt_disable(); + return 0; +} + +static void sched_perf_read_unlock(int idx) +{ + preempt_enable(); +} + +static struct rcu_perf_ops sched_ops = { + .ptype = RCU_SCHED_FLAVOR, + .init = rcu_sync_perf_init, + .readlock = sched_perf_read_lock, + .readunlock = sched_perf_read_unlock, + .started = rcu_batches_started_sched, + .completed = rcu_batches_completed_sched, + .exp_completed = rcu_exp_batches_completed_sched, + .sync = synchronize_sched, + .exp_sync = synchronize_sched_expedited, + .name = "sched" +}; + +#ifdef CONFIG_TASKS_RCU + +/* + * Definitions for RCU-tasks perf testing. 
+ */ + +static int tasks_perf_read_lock(void) +{ + return 0; +} + +static void tasks_perf_read_unlock(int idx) +{ +} + +static struct rcu_perf_ops tasks_ops = { + .ptype = RCU_TASKS_FLAVOR, + .init = rcu_sync_perf_init, + .readlock = tasks_perf_read_lock, + .readunlock = tasks_perf_read_unlock, + .started = rcu_no_completed, + .completed = rcu_no_completed, + .sync = synchronize_rcu_tasks, + .exp_sync = synchronize_rcu_tasks, + .name = "tasks" +}; + +#define RCUPERF_TASKS_OPS &tasks_ops, + +static bool __maybe_unused torturing_tasks(void) +{ + return cur_ops == &tasks_ops; +} + +#else /* #ifdef CONFIG_TASKS_RCU */ + +#define RCUPERF_TASKS_OPS + +static bool __maybe_unused torturing_tasks(void) +{ + return false; +} + +#endif /* #else #ifdef CONFIG_TASKS_RCU */ + +/* + * If performance tests complete, wait for shutdown to commence. + */ +static void rcu_perf_wait_shutdown(void) +{ + cond_resched_rcu_qs(); + if (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters) + return; + while (!torture_must_stop()) + schedule_timeout_uninterruptible(1); +} + +/* + * RCU perf reader kthread. Repeatedly does empty RCU read-side + * critical section, minimizing update-side interference. + */ +static int +rcu_perf_reader(void *arg) +{ + unsigned long flags; + int idx; + long me = (long)arg; + + VERBOSE_PERFOUT_STRING("rcu_perf_reader task started"); + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + set_user_nice(current, MAX_NICE); + atomic_inc(&n_rcu_perf_reader_started); + + do { + local_irq_save(flags); + idx = cur_ops->readlock(); + cur_ops->readunlock(idx); + local_irq_restore(flags); + rcu_perf_wait_shutdown(); + } while (!torture_must_stop()); + torture_kthread_stopping("rcu_perf_reader"); + return 0; +} + +/* + * RCU perf writer kthread. Repeatedly does a grace period. 
+ */ +static int +rcu_perf_writer(void *arg) +{ + int i = 0; + int i_max; + long me = (long)arg; + struct sched_param sp; + bool started = false, done = false, alldone = false; + u64 t; + u64 *wdp; + u64 *wdpp = writer_durations[me]; + + VERBOSE_PERFOUT_STRING("rcu_perf_writer task started"); + WARN_ON(rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp); + WARN_ON(rcu_gp_is_normal() && gp_exp); + WARN_ON(!wdpp); + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + sp.sched_priority = 1; + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); + + if (holdoff) + schedule_timeout_uninterruptible(holdoff * HZ); + + t = ktime_get_mono_fast_ns(); + if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) { + t_rcu_perf_writer_started = t; + if (gp_exp) { + b_rcu_perf_writer_started = + cur_ops->exp_completed() / 2; + } else { + b_rcu_perf_writer_started = + cur_ops->completed(); + } + } + + do { + wdp = &wdpp[i]; + *wdp = ktime_get_mono_fast_ns(); + if (gp_exp) { + rcu_perf_writer_state = RTWS_EXP_SYNC; + cur_ops->exp_sync(); + } else { + rcu_perf_writer_state = RTWS_SYNC; + cur_ops->sync(); + } + rcu_perf_writer_state = RTWS_IDLE; + t = ktime_get_mono_fast_ns(); + *wdp = t - *wdp; + i_max = i; + if (!started && + atomic_read(&n_rcu_perf_writer_started) >= nrealwriters) + started = true; + if (!done && i >= MIN_MEAS) { + done = true; + sp.sched_priority = 0; + sched_setscheduler_nocheck(current, + SCHED_NORMAL, &sp); + pr_alert("%s" PERF_FLAG + "rcu_perf_writer %ld has %d measurements\n", + perf_type, me, MIN_MEAS); + if (atomic_inc_return(&n_rcu_perf_writer_finished) >= + nrealwriters) { + schedule_timeout_interruptible(10); + rcu_ftrace_dump(DUMP_ALL); + PERFOUT_STRING("Test complete"); + t_rcu_perf_writer_finished = t; + if (gp_exp) { + b_rcu_perf_writer_finished = + cur_ops->exp_completed() / 2; + } else { + b_rcu_perf_writer_finished = + cur_ops->completed(); + } + if (shutdown) { + smp_mb(); /* Assign before wake. 
*/ + wake_up(&shutdown_wq); + } + } + } + if (done && !alldone && + atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters) + alldone = true; + if (started && !alldone && i < MAX_MEAS - 1) + i++; + rcu_perf_wait_shutdown(); + } while (!torture_must_stop()); + rcu_perf_writer_state = RTWS_STOPPING; + writer_n_durations[me] = i_max; + torture_kthread_stopping("rcu_perf_writer"); + return 0; +} + +static inline void +rcu_perf_print_module_parms(struct rcu_perf_ops *cur_ops, const char *tag) +{ + pr_alert("%s" PERF_FLAG + "--- %s: nreaders=%d nwriters=%d verbose=%d shutdown=%d\n", + perf_type, tag, nrealreaders, nrealwriters, verbose, shutdown); +} + +static void +rcu_perf_cleanup(void) +{ + int i; + int j; + int ngps = 0; + u64 *wdp; + u64 *wdpp; + + if (torture_cleanup_begin()) + return; + + if (reader_tasks) { + for (i = 0; i < nrealreaders; i++) + torture_stop_kthread(rcu_perf_reader, + reader_tasks[i]); + kfree(reader_tasks); + } + + if (writer_tasks) { + for (i = 0; i < nrealwriters; i++) { + torture_stop_kthread(rcu_perf_writer, + writer_tasks[i]); + if (!writer_n_durations) + continue; + j = writer_n_durations[i]; + pr_alert("%s%s writer %d gps: %d\n", + perf_type, PERF_FLAG, i, j); + ngps += j; + } + pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n", + perf_type, PERF_FLAG, + t_rcu_perf_writer_started, t_rcu_perf_writer_finished, + t_rcu_perf_writer_finished - + t_rcu_perf_writer_started, + ngps, + b_rcu_perf_writer_finished - + b_rcu_perf_writer_started); + for (i = 0; i < nrealwriters; i++) { + if (!writer_durations) + break; + if (!writer_n_durations) + continue; + wdpp = writer_durations[i]; + if (!wdpp) + continue; + for (j = 0; j <= writer_n_durations[i]; j++) { + wdp = &wdpp[j]; + pr_alert("%s%s %4d writer-duration: %5d %llu\n", + perf_type, PERF_FLAG, + i, j, *wdp); + if (j % 100 == 0) + schedule_timeout_uninterruptible(1); + } + kfree(writer_durations[i]); + } + kfree(writer_tasks); + kfree(writer_durations); + kfree(writer_n_durations); + } + + /* Do flavor-specific cleanup operations. */ + if (cur_ops->cleanup != NULL) + cur_ops->cleanup(); + + torture_cleanup_end(); +} + +/* + * Return the number if non-negative. If -1, the number of CPUs. + * If less than -1, that much less than the number of CPUs, but + * at least one. + */ +static int compute_real(int n) +{ + int nr; + + if (n >= 0) { + nr = n; + } else { + nr = num_online_cpus() + 1 + n; + if (nr <= 0) + nr = 1; + } + return nr; +} + +/* + * RCU perf shutdown kthread. Just waits to be awakened, then shuts + * down system. + */ +static int +rcu_perf_shutdown(void *arg) +{ + do { + wait_event(shutdown_wq, + atomic_read(&n_rcu_perf_writer_finished) >= + nrealwriters); + } while (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters); + smp_mb(); /* Wake before output. */ + rcu_perf_cleanup(); + kernel_power_off(); + return -EINVAL; +} + +static int __init +rcu_perf_init(void) +{ + long i; + int firsterr = 0; + static struct rcu_perf_ops *perf_ops[] = { + &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, + RCUPERF_TASKS_OPS + }; + + if (!torture_init_begin(perf_type, verbose, &perf_runnable)) + return -EBUSY; + + /* Process args and tell the world that the perf'er is on the job. 
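A worked example of the compute_real() convention used for the nreaders/nwriters parameters above; the CPU count is assumed:

/*
 * Assuming num_online_cpus() == 8:
 *
 *   compute_real(4)   == 4    explicit thread count
 *   compute_real(-1)  == 8    one thread per CPU
 *   compute_real(-3)  == 6    all CPUs minus two
 *   compute_real(-20) == 1    clamped to at least one
 */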
*/ + for (i = 0; i < ARRAY_SIZE(perf_ops); i++) { + cur_ops = perf_ops[i]; + if (strcmp(perf_type, cur_ops->name) == 0) + break; + } + if (i == ARRAY_SIZE(perf_ops)) { + pr_alert("rcu-perf: invalid perf type: \"%s\"\n", + perf_type); + pr_alert("rcu-perf types:"); + for (i = 0; i < ARRAY_SIZE(perf_ops); i++) + pr_alert(" %s", perf_ops[i]->name); + pr_alert("\n"); + firsterr = -EINVAL; + goto unwind; + } + if (cur_ops->init) + cur_ops->init(); + + nrealwriters = compute_real(nwriters); + nrealreaders = compute_real(nreaders); + atomic_set(&n_rcu_perf_reader_started, 0); + atomic_set(&n_rcu_perf_writer_started, 0); + atomic_set(&n_rcu_perf_writer_finished, 0); + rcu_perf_print_module_parms(cur_ops, "Start of test"); + + /* Start up the kthreads. */ + + if (shutdown) { + init_waitqueue_head(&shutdown_wq); + firsterr = torture_create_kthread(rcu_perf_shutdown, NULL, + shutdown_task); + if (firsterr) + goto unwind; + schedule_timeout_uninterruptible(1); + } + reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]), + GFP_KERNEL); + if (reader_tasks == NULL) { + VERBOSE_PERFOUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nrealreaders; i++) { + firsterr = torture_create_kthread(rcu_perf_reader, (void *)i, + reader_tasks[i]); + if (firsterr) + goto unwind; + } + while (atomic_read(&n_rcu_perf_reader_started) < nrealreaders) + schedule_timeout_uninterruptible(1); + writer_tasks = kcalloc(nrealwriters, sizeof(reader_tasks[0]), + GFP_KERNEL); + writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations), + GFP_KERNEL); + writer_n_durations = + kcalloc(nrealwriters, sizeof(*writer_n_durations), + GFP_KERNEL); + if (!writer_tasks || !writer_durations || !writer_n_durations) { + VERBOSE_PERFOUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nrealwriters; i++) { + writer_durations[i] = + kcalloc(MAX_MEAS, sizeof(*writer_durations[i]), + GFP_KERNEL); + if (!writer_durations[i]) + goto unwind; + firsterr = torture_create_kthread(rcu_perf_writer, (void *)i, + writer_tasks[i]); + if (firsterr) + goto unwind; + } + torture_init_end(); + return 0; + +unwind: + torture_init_end(); + rcu_perf_cleanup(); + return firsterr; +} + +module_init(rcu_perf_init); +module_exit(rcu_perf_cleanup); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 250ea67c1615..084a28a732eb 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -130,8 +130,8 @@ static struct rcu_torture __rcu *rcu_torture_current; static unsigned long rcu_torture_current_version; static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; static DEFINE_SPINLOCK(rcu_torture_lock); -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = { 0 }; -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = { 0 }; +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count); +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch); static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; static atomic_t n_rcu_torture_alloc; static atomic_t n_rcu_torture_alloc_fail; @@ -916,7 +916,7 @@ rcu_torture_fqs(void *arg) static int rcu_torture_writer(void *arg) { - bool can_expedite = !rcu_gp_is_expedited(); + bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal(); int expediting = 0; unsigned long gp_snap; bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; @@ -932,7 +932,7 @@ rcu_torture_writer(void *arg) 
VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); if (!can_expedite) { pr_alert("%s" TORTURE_FLAG - " Grace periods expedited from boot/sysfs for %s,\n", + " GP expediting controlled from boot/sysfs for %s,\n", torture_type, cur_ops->name); pr_alert("%s" TORTURE_FLAG " Disabled dynamic grace-period expediting.\n", @@ -1082,17 +1082,6 @@ rcu_torture_fakewriter(void *arg) return 0; } -static void rcutorture_trace_dump(void) -{ - static atomic_t beenhere = ATOMIC_INIT(0); - - if (atomic_read(&beenhere)) - return; - if (atomic_xchg(&beenhere, 1) != 0) - return; - ftrace_dump(DUMP_ALL); -} - /* * RCU torture reader from timer handler. Dereferences rcu_torture_current, * incrementing the corresponding element of the pipeline array. The @@ -1142,7 +1131,7 @@ static void rcu_torture_timer(unsigned long unused) if (pipe_count > 1) { do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, started, completed); - rcutorture_trace_dump(); + rcu_ftrace_dump(DUMP_ALL); } __this_cpu_inc(rcu_torture_count[pipe_count]); completed = completed - started; @@ -1215,7 +1204,7 @@ rcu_torture_reader(void *arg) if (pipe_count > 1) { do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, started, completed); - rcutorture_trace_dump(); + rcu_ftrace_dump(DUMP_ALL); } __this_cpu_inc(rcu_torture_count[pipe_count]); completed = completed - started; @@ -1333,7 +1322,7 @@ rcu_torture_stats_print(void) rcu_torture_writer_state, gpnum, completed, flags); show_rcu_gp_kthreads(); - rcutorture_trace_dump(); + rcu_ftrace_dump(DUMP_ALL); } rtcv_snap = rcu_torture_current_version; } @@ -1489,7 +1478,9 @@ static int rcu_torture_barrier_cbs(void *arg) * The above smp_load_acquire() ensures barrier_phase load * is ordered before the folloiwng ->call(). */ + local_irq_disable(); /* Just to test no-irq call_rcu(). */ cur_ops->call(&rcu, rcu_torture_barrier_cbf); + local_irq_enable(); if (atomic_dec_and_test(&barrier_cbs_count)) wake_up(&barrier_wq); } while (!torture_must_stop()); @@ -1596,7 +1587,7 @@ static int rcutorture_cpu_notify(struct notifier_block *self, { long cpu = (long)hcpu; - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_ONLINE: case CPU_DOWN_FAILED: (void)rcutorture_booster_init(cpu); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9a535a86e732..c7f1bc4f817c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -102,6 +102,8 @@ struct rcu_state sname##_state = { \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ + .exp_mutex = __MUTEX_INITIALIZER(sname##_state.exp_mutex), \ + .exp_wake_mutex = __MUTEX_INITIALIZER(sname##_state.exp_wake_mutex), \ } RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); @@ -370,6 +372,21 @@ void rcu_all_qs(void) rcu_momentary_dyntick_idle(); local_irq_restore(flags); } + if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) { + /* + * Yes, we just checked a per-CPU variable with preemption + * enabled, so we might be migrated to some other CPU at + * this point. That is OK because in that case, the + * migration will supply the needed quiescent state. + * We might end up needlessly disabling preemption and + * invoking rcu_sched_qs() on the destination CPU, but + * the probability and cost are both quite low, so this + * should not be a problem in practice. + */ + preempt_disable(); + rcu_sched_qs(); + preempt_enable(); + } this_cpu_inc(rcu_qs_ctr); barrier(); /* Avoid RCU read-side critical sections leaking up. 
*/ } @@ -385,9 +402,11 @@ module_param(qlowmark, long, 0444); static ulong jiffies_till_first_fqs = ULONG_MAX; static ulong jiffies_till_next_fqs = ULONG_MAX; +static bool rcu_kick_kthreads; module_param(jiffies_till_first_fqs, ulong, 0644); module_param(jiffies_till_next_fqs, ulong, 0644); +module_param(rcu_kick_kthreads, bool, 0644); /* * How long the grace period must be before we start recruiting @@ -460,6 +479,28 @@ unsigned long rcu_batches_completed_bh(void) EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); /* + * Return the number of RCU expedited batches completed thus far for + * debug & stats. Odd numbers mean that a batch is in progress, even + * numbers mean idle. The value returned will thus be roughly double + * the cumulative batches since boot. + */ +unsigned long rcu_exp_batches_completed(void) +{ + return rcu_state_p->expedited_sequence; +} +EXPORT_SYMBOL_GPL(rcu_exp_batches_completed); + +/* + * Return the number of RCU-sched expedited batches completed thus far + * for debug & stats. Similar to rcu_exp_batches_completed(). + */ +unsigned long rcu_exp_batches_completed_sched(void) +{ + return rcu_sched_state.expedited_sequence; +} +EXPORT_SYMBOL_GPL(rcu_exp_batches_completed_sched); + +/* * Force a quiescent state. */ void rcu_force_quiescent_state(void) @@ -637,7 +678,7 @@ static void rcu_eqs_enter_common(long long oldval, bool user) idle_task(smp_processor_id()); trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0); - ftrace_dump(DUMP_ORIG); + rcu_ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! */ @@ -799,7 +840,7 @@ static void rcu_eqs_exit_common(long long oldval, int user) trace_rcu_dyntick(TPS("Error on exit: not idle task"), oldval, rdtp->dynticks_nesting); - ftrace_dump(DUMP_ORIG); + rcu_ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! */ @@ -1224,8 +1265,10 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) rsp->gp_flags, gp_state_getname(rsp->gp_state), rsp->gp_state, rsp->gp_kthread ? rsp->gp_kthread->state : ~0); - if (rsp->gp_kthread) + if (rsp->gp_kthread) { sched_show_task(rsp->gp_kthread); + wake_up_process(rsp->gp_kthread); + } } } @@ -1249,6 +1292,25 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) } } +/* + * If too much time has passed in the current grace period, and if + * so configured, go kick the relevant kthreads. + */ +static void rcu_stall_kick_kthreads(struct rcu_state *rsp) +{ + unsigned long j; + + if (!rcu_kick_kthreads) + return; + j = READ_ONCE(rsp->jiffies_kick_kthreads); + if (time_after(jiffies, j) && rsp->gp_kthread) { + WARN_ONCE(1, "Kicking %s grace-period kthread\n", rsp->name); + rcu_ftrace_dump(DUMP_ALL); + wake_up_process(rsp->gp_kthread); + WRITE_ONCE(rsp->jiffies_kick_kthreads, j + HZ); + } +} + static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) { int cpu; @@ -1260,6 +1322,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) struct rcu_node *rnp = rcu_get_root(rsp); long totqlen = 0; + /* Kick and suppress, if so configured. */ + rcu_stall_kick_kthreads(rsp); + if (rcu_cpu_stall_suppress) + return; + /* Only let one CPU complain about others per time interval. 
*/ raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -1333,6 +1400,11 @@ static void print_cpu_stall(struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); long totqlen = 0; + /* Kick and suppress, if so configured. */ + rcu_stall_kick_kthreads(rsp); + if (rcu_cpu_stall_suppress) + return; + /* * OK, time to rat on ourselves... * See Documentation/RCU/stallwarn.txt for info on how to debug @@ -1377,8 +1449,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) unsigned long js; struct rcu_node *rnp; - if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp)) + if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) || + !rcu_gp_in_progress(rsp)) return; + rcu_stall_kick_kthreads(rsp); j = jiffies; /* @@ -2117,8 +2191,11 @@ static int __noreturn rcu_gp_kthread(void *arg) } ret = 0; for (;;) { - if (!ret) + if (!ret) { rsp->jiffies_force_qs = jiffies + j; + WRITE_ONCE(rsp->jiffies_kick_kthreads, + jiffies + 3 * j); + } trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("fqswait")); @@ -2144,6 +2221,15 @@ static int __noreturn rcu_gp_kthread(void *arg) TPS("fqsend")); cond_resched_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); + ret = 0; /* Force full wait till next FQS. */ + j = jiffies_till_next_fqs; + if (j > HZ) { + j = HZ; + jiffies_till_next_fqs = HZ; + } else if (j < 1) { + j = 1; + jiffies_till_next_fqs = 1; + } } else { /* Deal with stray signal. */ cond_resched_rcu_qs(); @@ -2152,14 +2238,12 @@ static int __noreturn rcu_gp_kthread(void *arg) trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("fqswaitsig")); - } - j = jiffies_till_next_fqs; - if (j > HZ) { - j = HZ; - jiffies_till_next_fqs = HZ; - } else if (j < 1) { - j = 1; - jiffies_till_next_fqs = 1; + ret = 1; /* Keep old FQS timing. */ + j = jiffies; + if (time_after(jiffies, rsp->jiffies_force_qs)) + j = 1; + else + j = rsp->jiffies_force_qs - j; } } @@ -3376,8 +3460,12 @@ static void rcu_exp_gp_seq_end(struct rcu_state *rsp) } static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) { + unsigned long s; + smp_mb(); /* Caller's modifications seen first by other CPUs. */ - return rcu_seq_snap(&rsp->expedited_sequence); + s = rcu_seq_snap(&rsp->expedited_sequence); + trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); + return s; } static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) { @@ -3469,7 +3557,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) * for the current expedited grace period. Works only for preemptible * RCU -- other RCU implementation use other means. * - * Caller must hold the root rcu_node's exp_funnel_mutex. + * Caller must hold the rcu_state's exp_mutex. */ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) { @@ -3485,8 +3573,8 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) * recursively up the tree. (Calm down, calm down, we do the recursion * iteratively!) * - * Caller must hold the root rcu_node's exp_funnel_mutex and the - * specified rcu_node structure's ->lock. + * Caller must hold the rcu_state's exp_mutex and the specified rcu_node + * structure's ->lock. */ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake, unsigned long flags) @@ -3523,7 +3611,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, * Report expedited quiescent state for specified node. This is a * lock-acquisition wrapper function for __rcu_report_exp_rnp(). * - * Caller must hold the root rcu_node's exp_funnel_mutex. 
+ * Caller must hold the rcu_state's exp_mutex. */ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake) @@ -3536,8 +3624,8 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, /* * Report expedited quiescent state for multiple CPUs, all covered by the - * specified leaf rcu_node structure. Caller must hold the root - * rcu_node's exp_funnel_mutex. + * specified leaf rcu_node structure. Caller must hold the rcu_state's + * exp_mutex. */ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, unsigned long mask, bool wake) @@ -3555,7 +3643,6 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, /* * Report expedited quiescent state for specified rcu_data (CPU). - * Caller must hold the root rcu_node's exp_funnel_mutex. */ static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, bool wake) @@ -3564,15 +3651,11 @@ static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, } /* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ -static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, - struct rcu_data *rdp, - atomic_long_t *stat, unsigned long s) +static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat, + unsigned long s) { if (rcu_exp_gp_seq_done(rsp, s)) { - if (rnp) - mutex_unlock(&rnp->exp_funnel_mutex); - else if (rdp) - mutex_unlock(&rdp->exp_funnel_mutex); + trace_rcu_exp_grace_period(rsp->name, s, TPS("done")); /* Ensure test happens before caller kfree(). */ smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(stat); @@ -3582,59 +3665,65 @@ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, } /* - * Funnel-lock acquisition for expedited grace periods. Returns a - * pointer to the root rcu_node structure, or NULL if some other - * task did the expedited grace period for us. + * Funnel-lock acquisition for expedited grace periods. Returns true + * if some other task completed an expedited grace period that this task + * can piggy-back on, and with no mutex held. Otherwise, returns false + * with the mutex held, indicating that the caller must actually do the + * expedited grace period. */ -static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) +static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) { struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); - struct rcu_node *rnp0; - struct rcu_node *rnp1 = NULL; + struct rcu_node *rnp = rdp->mynode; + struct rcu_node *rnp_root = rcu_get_root(rsp); + + /* Low-contention fastpath. */ + if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) && + (rnp == rnp_root || + ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) && + !mutex_is_locked(&rsp->exp_mutex) && + mutex_trylock(&rsp->exp_mutex)) + goto fastpath; /* - * First try directly acquiring the root lock in order to reduce - * latency in the common case where expedited grace periods are - * rare. We check mutex_is_locked() to avoid pathological levels of - * memory contention on ->exp_funnel_mutex in the heavy-load case. + * Each pass through the following loop works its way up + * the rcu_node tree, returning if others have done the work or + * otherwise falls through to acquire rsp->exp_mutex. The mapping + * from CPU to rcu_node structure can be inexact, as it is just + * promoting locality and is not strictly needed for correctness. 
*/ - rnp0 = rcu_get_root(rsp); - if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) { - if (mutex_trylock(&rnp0->exp_funnel_mutex)) { - if (sync_exp_work_done(rsp, rnp0, NULL, - &rdp->expedited_workdone0, s)) - return NULL; - return rnp0; + for (; rnp != NULL; rnp = rnp->parent) { + if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s)) + return true; + + /* Work not done, either wait here or go up. */ + spin_lock(&rnp->exp_lock); + if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) { + + /* Someone else doing GP, so wait for them. */ + spin_unlock(&rnp->exp_lock); + trace_rcu_exp_funnel_lock(rsp->name, rnp->level, + rnp->grplo, rnp->grphi, + TPS("wait")); + wait_event(rnp->exp_wq[(s >> 1) & 0x3], + sync_exp_work_done(rsp, + &rdp->exp_workdone2, s)); + return true; } + rnp->exp_seq_rq = s; /* Followers can wait on us. */ + spin_unlock(&rnp->exp_lock); + trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo, + rnp->grphi, TPS("nxtlvl")); } - - /* - * Each pass through the following loop works its way - * up the rcu_node tree, returning if others have done the - * work or otherwise falls through holding the root rnp's - * ->exp_funnel_mutex. The mapping from CPU to rcu_node structure - * can be inexact, as it is just promoting locality and is not - * strictly needed for correctness. - */ - if (sync_exp_work_done(rsp, NULL, NULL, &rdp->expedited_workdone1, s)) - return NULL; - mutex_lock(&rdp->exp_funnel_mutex); - rnp0 = rdp->mynode; - for (; rnp0 != NULL; rnp0 = rnp0->parent) { - if (sync_exp_work_done(rsp, rnp1, rdp, - &rdp->expedited_workdone2, s)) - return NULL; - mutex_lock(&rnp0->exp_funnel_mutex); - if (rnp1) - mutex_unlock(&rnp1->exp_funnel_mutex); - else - mutex_unlock(&rdp->exp_funnel_mutex); - rnp1 = rnp0; + mutex_lock(&rsp->exp_mutex); +fastpath: + if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) { + mutex_unlock(&rsp->exp_mutex); + return true; } - if (sync_exp_work_done(rsp, rnp1, rdp, - &rdp->expedited_workdone3, s)) - return NULL; - return rnp1; + rcu_exp_gp_seq_start(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); + return false; } /* Invoked on each online non-idle CPU for expedited quiescent state. 
*/ @@ -3649,6 +3738,11 @@ static void sync_sched_exp_handler(void *data) if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) return; + if (rcu_is_cpu_rrupt_from_idle()) { + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(&rcu_sched_data), true); + return; + } __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); resched_cpu(smp_processor_id()); } @@ -3773,7 +3867,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) rsp->name); ndetected = 0; rcu_for_each_leaf_node(rsp, rnp) { - ndetected = rcu_print_task_exp_stall(rnp); + ndetected += rcu_print_task_exp_stall(rnp); mask = 1; for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { struct rcu_data *rdp; @@ -3783,7 +3877,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) ndetected++; rdp = per_cpu_ptr(rsp->rda, cpu); pr_cont(" %d-%c%c%c", cpu, - "O."[cpu_online(cpu)], + "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rnp->expmaskinit)], "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); } @@ -3792,7 +3886,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", jiffies - jiffies_start, rsp->expedited_sequence, rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); - if (!ndetected) { + if (ndetected) { pr_err("blocking rcu_node structures:"); rcu_for_each_node_breadth_first(rsp, rnp) { if (rnp == rnp_root) @@ -3818,6 +3912,41 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) } } +/* + * Wait for the current expedited grace period to complete, and then + * wake up everyone who piggybacked on the just-completed expedited + * grace period. Also update all the ->exp_seq_rq counters as needed + * in order to avoid counter-wrap problems. + */ +static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) +{ + struct rcu_node *rnp; + + synchronize_sched_expedited_wait(rsp); + rcu_exp_gp_seq_end(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); + + /* + * Switch over to wakeup mode, allowing the next GP, but -only- the + * next GP, to proceed. + */ + mutex_lock(&rsp->exp_wake_mutex); + mutex_unlock(&rsp->exp_mutex); + + rcu_for_each_node_breadth_first(rsp, rnp) { + if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { + spin_lock(&rnp->exp_lock); + /* Recheck, avoid hang in case someone just arrived. */ + if (ULONG_CMP_LT(rnp->exp_seq_rq, s)) + rnp->exp_seq_rq = s; + spin_unlock(&rnp->exp_lock); + } + wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]); + } + trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); + mutex_unlock(&rsp->exp_wake_mutex); +} + /** * synchronize_sched_expedited - Brute-force RCU-sched grace period * @@ -3837,7 +3966,6 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) void synchronize_sched_expedited(void) { unsigned long s; - struct rcu_node *rnp; struct rcu_state *rsp = &rcu_sched_state; /* If only one CPU, this is automatically a grace period. */ @@ -3852,17 +3980,14 @@ void synchronize_sched_expedited(void) /* Take a snapshot of the sequence number. */ s = rcu_exp_gp_seq_snap(rsp); - - rnp = exp_funnel_lock(rsp, s); - if (rnp == NULL) + if (exp_funnel_lock(rsp, s)) return; /* Someone else did our work for us. */ - rcu_exp_gp_seq_start(rsp); + /* Initialize the rcu_node tree in preparation for the wait. 
*/ sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); - synchronize_sched_expedited_wait(rsp); - rcu_exp_gp_seq_end(rsp); - mutex_unlock(&rnp->exp_funnel_mutex); + /* Wait and clean up, including waking everyone. */ + rcu_exp_wait_wake(rsp, s); } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); @@ -4162,7 +4287,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); rdp->cpu = cpu; rdp->rsp = rsp; - mutex_init(&rdp->exp_funnel_mutex); rcu_boot_init_nocb_percpu_data(rdp); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } @@ -4420,10 +4544,8 @@ static void __init rcu_init_one(struct rcu_state *rsp) { static const char * const buf[] = RCU_NODE_NAME_INIT; static const char * const fqs[] = RCU_FQS_NAME_INIT; - static const char * const exp[] = RCU_EXP_NAME_INIT; static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; - static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; static u8 fl_mask = 0x1; int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ @@ -4482,9 +4604,11 @@ static void __init rcu_init_one(struct rcu_state *rsp) rnp->level = i; INIT_LIST_HEAD(&rnp->blkd_tasks); rcu_init_one_nocb(rnp); - mutex_init(&rnp->exp_funnel_mutex); - lockdep_set_class_and_name(&rnp->exp_funnel_mutex, - &rcu_exp_class[i], exp[i]); + init_waitqueue_head(&rnp->exp_wq[0]); + init_waitqueue_head(&rnp->exp_wq[1]); + init_waitqueue_head(&rnp->exp_wq[2]); + init_waitqueue_head(&rnp->exp_wq[3]); + spin_lock_init(&rnp->exp_lock); } } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index df668c0f9e64..e3959f5e6ddf 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -70,7 +70,6 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } # define RCU_NODE_NAME_INIT { "rcu_node_0" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } -# define RCU_EXP_NAME_INIT { "rcu_node_exp_0" } #elif NR_CPUS <= RCU_FANOUT_2 # define RCU_NUM_LVLS 2 # define NUM_RCU_LVL_0 1 @@ -79,7 +78,6 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } -# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" } #elif NR_CPUS <= RCU_FANOUT_3 # define RCU_NUM_LVLS 3 # define NUM_RCU_LVL_0 1 @@ -89,7 +87,6 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } -# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" } #elif NR_CPUS <= RCU_FANOUT_4 # define RCU_NUM_LVLS 4 # define NUM_RCU_LVL_0 1 @@ -100,7 +97,6 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } -# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" } #else # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ @@ -252,7 +248,9 @@ struct rcu_node { /* Counts of upcoming no-CB GP requests. 
*/ raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; - struct mutex exp_funnel_mutex ____cacheline_internodealigned_in_smp; + spinlock_t exp_lock ____cacheline_internodealigned_in_smp; + unsigned long exp_seq_rq; + wait_queue_head_t exp_wq[4]; } ____cacheline_internodealigned_in_smp; /* @@ -387,11 +385,9 @@ struct rcu_data { #ifdef CONFIG_RCU_FAST_NO_HZ struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ - struct mutex exp_funnel_mutex; - atomic_long_t expedited_workdone0; /* # done by others #0. */ - atomic_long_t expedited_workdone1; /* # done by others #1. */ - atomic_long_t expedited_workdone2; /* # done by others #2. */ - atomic_long_t expedited_workdone3; /* # done by others #3. */ + atomic_long_t exp_workdone1; /* # done by others #1. */ + atomic_long_t exp_workdone2; /* # done by others #2. */ + atomic_long_t exp_workdone3; /* # done by others #3. */ /* 7) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU @@ -505,6 +501,8 @@ struct rcu_state { /* _rcu_barrier(). */ /* End of fields guarded by barrier_mutex. */ + struct mutex exp_mutex; /* Serialize expedited GP. */ + struct mutex exp_wake_mutex; /* Serialize wakeup. */ unsigned long expedited_sequence; /* Take a ticket. */ atomic_long_t expedited_normal; /* # fallbacks to normal. */ atomic_t expedited_need_qs; /* # CPUs left to check in. */ @@ -513,6 +511,8 @@ struct rcu_state { unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). */ + unsigned long jiffies_kick_kthreads; /* Time at which to kick */ + /* kthreads, if configured. */ unsigned long n_force_qs; /* Number of calls to */ /* force_quiescent_state(). */ unsigned long n_force_qs_lh; /* ~Number of calls leaving */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index efdf7b61ce12..ff1cd4e1188d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -722,18 +722,22 @@ static void sync_rcu_exp_handler(void *info) * synchronize_rcu_expedited - Brute-force RCU grace period * * Wait for an RCU-preempt grace period, but expedite it. The basic - * idea is to invoke synchronize_sched_expedited() to push all the tasks to - * the ->blkd_tasks lists and wait for this list to drain. This consumes - * significant time on all CPUs and is unfriendly to real-time workloads, - * so is thus not recommended for any sort of common-case code. - * In fact, if you are using synchronize_rcu_expedited() in a loop, - * please restructure your code to batch your updates, and then Use a - * single synchronize_rcu() instead. + * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler + * checks whether the CPU is in an RCU-preempt critical section, and + * if so, it sets a flag that causes the outermost rcu_read_unlock() + * to report the quiescent state. On the other hand, if the CPU is + * not in an RCU read-side critical section, the IPI handler reports + * the quiescent state immediately. + * + * Although this is a greate improvement over previous expedited + * implementations, it is still unfriendly to real-time workloads, so is + * thus not recommended for any sort of common-case code. In fact, if + * you are using synchronize_rcu_expedited() in a loop, please restructure + * your code to batch your updates, and then Use a single synchronize_rcu() + * instead. 
*/ void synchronize_rcu_expedited(void) { - struct rcu_node *rnp; - struct rcu_node *rnp_unlock; struct rcu_state *rsp = rcu_state_p; unsigned long s; @@ -744,23 +748,14 @@ void synchronize_rcu_expedited(void) } s = rcu_exp_gp_seq_snap(rsp); - - rnp_unlock = exp_funnel_lock(rsp, s); - if (rnp_unlock == NULL) + if (exp_funnel_lock(rsp, s)) return; /* Someone else did our work for us. */ - rcu_exp_gp_seq_start(rsp); - /* Initialize the rcu_node tree in preparation for the wait. */ sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); - /* Wait for snapshotted ->blkd_tasks lists to drain. */ - rnp = rcu_get_root(rsp); - synchronize_sched_expedited_wait(rsp); - - /* Clean up and exit. */ - rcu_exp_gp_seq_end(rsp); - mutex_unlock(&rnp_unlock->exp_funnel_mutex); + /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */ + rcu_exp_wait_wake(rsp, s); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 1088e64f01ad..86782f9a4604 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -185,17 +185,16 @@ static int show_rcuexp(struct seq_file *m, void *v) int cpu; struct rcu_state *rsp = (struct rcu_state *)m->private; struct rcu_data *rdp; - unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0; + unsigned long s1 = 0, s2 = 0, s3 = 0; for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(rsp->rda, cpu); - s0 += atomic_long_read(&rdp->expedited_workdone0); - s1 += atomic_long_read(&rdp->expedited_workdone1); - s2 += atomic_long_read(&rdp->expedited_workdone2); - s3 += atomic_long_read(&rdp->expedited_workdone3); + s1 += atomic_long_read(&rdp->exp_workdone1); + s2 += atomic_long_read(&rdp->exp_workdone2); + s3 += atomic_long_read(&rdp->exp_workdone3); } - seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", - rsp->expedited_sequence, s0, s1, s2, s3, + seq_printf(m, "s=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", + rsp->expedited_sequence, s1, s2, s3, atomic_long_read(&rsp->expedited_normal), atomic_read(&rsp->expedited_need_qs), rsp->expedited_sequence / 2); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index ca828b41c938..3ccdc8eebc5a 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -67,7 +67,7 @@ static int rcu_normal_after_boot; module_param(rcu_normal_after_boot, int, 0); #endif /* #ifndef CONFIG_TINY_RCU */ -#if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT) +#ifdef CONFIG_DEBUG_LOCK_ALLOC /** * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section? * @@ -111,7 +111,7 @@ int rcu_read_lock_sched_held(void) return 0; if (debug_locks) lockdep_opinion = lock_is_held(&rcu_sched_lock_map); - return lockdep_opinion || preempt_count() != 0 || irqs_disabled(); + return lockdep_opinion || !preemptible(); } EXPORT_SYMBOL(rcu_read_lock_sched_held); #endif diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index fedb967a9841..e85a725e5c34 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -318,6 +318,7 @@ u64 sched_clock_cpu(int cpu) return clock; } +EXPORT_SYMBOL_GPL(sched_clock_cpu); void sched_clock_tick(void) { @@ -363,39 +364,6 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); -/* - * As outlined at the top, provides a fast, high resolution, nanosecond - * time source that is monotonic per cpu argument and has bounded drift - * between cpus. 
- * - * ######################### BIG FAT WARNING ########################## - * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # - * # go backwards !! # - * #################################################################### - */ -u64 cpu_clock(int cpu) -{ - if (!sched_clock_stable()) - return sched_clock_cpu(cpu); - - return sched_clock(); -} - -/* - * Similar to cpu_clock() for the current cpu. Time will only be observed - * to be monotonic if care is taken to only compare timestampt taken on the - * same CPU. - * - * See cpu_clock(). - */ -u64 local_clock(void) -{ - if (!sched_clock_stable()) - return sched_clock_cpu(raw_smp_processor_id()); - - return sched_clock(); -} - #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ void sched_clock_init(void) @@ -410,22 +378,8 @@ u64 sched_clock_cpu(int cpu) return sched_clock(); } - -u64 cpu_clock(int cpu) -{ - return sched_clock(); -} - -u64 local_clock(void) -{ - return sched_clock(); -} - #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ -EXPORT_SYMBOL_GPL(cpu_clock); -EXPORT_SYMBOL_GPL(local_clock); - /* * Running clock - returns the time that has elapsed while a guest has been * running. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d1f7149f8704..404c0784b1fc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -33,7 +33,7 @@ #include <linux/init.h> #include <linux/uaccess.h> #include <linux/highmem.h> -#include <asm/mmu_context.h> +#include <linux/mmu_context.h> #include <linux/interrupt.h> #include <linux/capability.h> #include <linux/completion.h> @@ -170,6 +170,71 @@ static struct rq *this_rq_lock(void) return rq; } +/* + * __task_rq_lock - lock the rq @p resides on. + */ +struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(rq->lock) +{ + struct rq *rq; + + lockdep_assert_held(&p->pi_lock); + + for (;;) { + rq = task_rq(p); + raw_spin_lock(&rq->lock); + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { + rf->cookie = lockdep_pin_lock(&rq->lock); + return rq; + } + raw_spin_unlock(&rq->lock); + + while (unlikely(task_on_rq_migrating(p))) + cpu_relax(); + } +} + +/* + * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. + */ +struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(p->pi_lock) + __acquires(rq->lock) +{ + struct rq *rq; + + for (;;) { + raw_spin_lock_irqsave(&p->pi_lock, rf->flags); + rq = task_rq(p); + raw_spin_lock(&rq->lock); + /* + * move_queued_task() task_rq_lock() + * + * ACQUIRE (rq->lock) + * [S] ->on_rq = MIGRATING [L] rq = task_rq() + * WMB (__set_task_cpu()) ACQUIRE (rq->lock); + * [S] ->cpu = new_cpu [L] task_rq() + * [L] ->on_rq + * RELEASE (rq->lock) + * + * If we observe the old cpu in task_rq_lock, the acquire of + * the old rq->lock will fully serialize against the stores. + * + * If we observe the new cpu in task_rq_lock, the acquire will + * pair with the WMB to ensure we must then also see migrating. + */ + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { + rf->cookie = lockdep_pin_lock(&rq->lock); + return rq; + } + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); + + while (unlikely(task_on_rq_migrating(p))) + cpu_relax(); + } +} + #ifdef CONFIG_SCHED_HRTICK /* * Use HR-timers to deliver accurate preemption points. 
@@ -249,29 +314,6 @@ void hrtick_start(struct rq *rq, u64 delay) } } -static int -hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - int cpu = (int)(long)hcpu; - - switch (action) { - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - hrtick_clear(cpu_rq(cpu)); - return NOTIFY_OK; - } - - return NOTIFY_DONE; -} - -static __init void init_hrtick(void) -{ - hotcpu_notifier(hotplug_hrtick, 0); -} #else /* * Called to set the hrtick timer state. @@ -288,10 +330,6 @@ void hrtick_start(struct rq *rq, u64 delay) hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL_PINNED); } - -static inline void init_hrtick(void) -{ -} #endif /* CONFIG_SMP */ static void init_rq_hrtick(struct rq *rq) @@ -315,10 +353,6 @@ static inline void hrtick_clear(struct rq *rq) static inline void init_rq_hrtick(struct rq *rq) { } - -static inline void init_hrtick(void) -{ -} #endif /* CONFIG_SCHED_HRTICK */ /* @@ -400,7 +434,7 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task) * wakeup due to that. * * This cmpxchg() implies a full barrier, which pairs with the write - * barrier implied by the wakeup in wake_up_list(). + * barrier implied by the wakeup in wake_up_q(). */ if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) return; @@ -499,7 +533,10 @@ int get_nohz_timer_target(void) rcu_read_lock(); for_each_domain(cpu, sd) { for_each_cpu(i, sched_domain_span(sd)) { - if (!idle_cpu(i) && is_housekeeping_cpu(cpu)) { + if (cpu == i) + continue; + + if (!idle_cpu(i) && is_housekeeping_cpu(i)) { cpu = i; goto unlock; } @@ -1085,12 +1122,20 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) static int __set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask, bool check) { - unsigned long flags; - struct rq *rq; + const struct cpumask *cpu_valid_mask = cpu_active_mask; unsigned int dest_cpu; + struct rq_flags rf; + struct rq *rq; int ret = 0; - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); + + if (p->flags & PF_KTHREAD) { + /* + * Kernel threads are allowed on online && !active CPUs + */ + cpu_valid_mask = cpu_online_mask; + } /* * Must re-check here, to close a race against __kthread_bind(), @@ -1104,22 +1149,32 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (cpumask_equal(&p->cpus_allowed, new_mask)) goto out; - if (!cpumask_intersects(new_mask, cpu_active_mask)) { + if (!cpumask_intersects(new_mask, cpu_valid_mask)) { ret = -EINVAL; goto out; } do_set_cpus_allowed(p, new_mask); + if (p->flags & PF_KTHREAD) { + /* + * For kernel threads that do indeed end up on online && + * !active we want to ensure they are strict per-cpu threads. + */ + WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && + !cpumask_intersects(new_mask, cpu_active_mask) && + p->nr_cpus_allowed != 1); + } + /* Can the task run on the task's current CPU? If so, we're done */ if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); + dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); if (task_running(rq, p) || p->state == TASK_WAKING) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. 
*/ - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); tlb_migrate_finish(p->mm); return 0; @@ -1128,12 +1183,12 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, * OK, since we're going to drop the lock immediately * afterwards anyway. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, rf.cookie); rq = move_queued_task(rq, p, dest_cpu); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, rf.cookie); } out: - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return ret; } @@ -1317,8 +1372,8 @@ out: */ unsigned long wait_task_inactive(struct task_struct *p, long match_state) { - unsigned long flags; int running, queued; + struct rq_flags rf; unsigned long ncsw; struct rq *rq; @@ -1353,14 +1408,14 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * lock now, to be *sure*. If we're wrong, we'll * just go back and repeat. */ - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); trace_sched_wait_task(p); running = task_running(rq, p); queued = task_on_rq_queued(p); ncsw = 0; if (!match_state || p->state == match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); /* * If it changed from the expected state, bail out now. @@ -1434,6 +1489,25 @@ EXPORT_SYMBOL_GPL(kick_process); /* * ->cpus_allowed is protected by both rq->lock and p->pi_lock + * + * A few notes on cpu_active vs cpu_online: + * + * - cpu_active must be a subset of cpu_online + * + * - on cpu-up we allow per-cpu kthreads on the online && !active cpu, + * see __set_cpus_allowed_ptr(). At this point the newly online + * cpu isn't yet part of the sched domains, and balancing will not + * see it. + * + * - on cpu-down we clear cpu_active() to mask the sched domains and + * avoid the load balancer to place new tasks on the to be removed + * cpu. Existing tasks will remain running there and will be taken + * off. + * + * This means that fallback selection must not select !active CPUs. + * And can assume that any active CPU must be online. Conversely + * select_task_rq() below may allow selection of !active CPUs in order + * to satisfy the above rules. */ static int select_fallback_rq(int cpu, struct task_struct *p) { @@ -1452,8 +1526,6 @@ static int select_fallback_rq(int cpu, struct task_struct *p) /* Look for allowed, online CPU in same node. */ for_each_cpu(dest_cpu, nodemask) { - if (!cpu_online(dest_cpu)) - continue; if (!cpu_active(dest_cpu)) continue; if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) @@ -1464,8 +1536,6 @@ static int select_fallback_rq(int cpu, struct task_struct *p) for (;;) { /* Any allowed, online CPU? */ for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { - if (!cpu_online(dest_cpu)) - continue; if (!cpu_active(dest_cpu)) continue; goto out; @@ -1515,8 +1585,10 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) { lockdep_assert_held(&p->pi_lock); - if (p->nr_cpus_allowed > 1) + if (tsk_nr_cpus_allowed(p) > 1) cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); + else + cpu = cpumask_any(tsk_cpus_allowed(p)); /* * In order not to call set_task_cpu() on a blocking task we need @@ -1604,8 +1676,8 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl /* * Mark the task runnable and perform wakeup-preemption. 
*/ -static void -ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) +static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, + struct pin_cookie cookie) { check_preempt_curr(rq, p, wake_flags); p->state = TASK_RUNNING; @@ -1617,9 +1689,9 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) * Our task @p is fully woken up and running; so its safe to * drop the rq->lock, hereafter rq is only used for statistics. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); p->sched_class->task_woken(rq, p); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, cookie); } if (rq->idle_stamp) { @@ -1637,17 +1709,23 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) } static void -ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) +ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, + struct pin_cookie cookie) { + int en_flags = ENQUEUE_WAKEUP; + lockdep_assert_held(&rq->lock); #ifdef CONFIG_SMP if (p->sched_contributes_to_load) rq->nr_uninterruptible--; + + if (wake_flags & WF_MIGRATED) + en_flags |= ENQUEUE_MIGRATED; #endif - ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); - ttwu_do_wakeup(rq, p, wake_flags); + ttwu_activate(rq, p, en_flags); + ttwu_do_wakeup(rq, p, wake_flags, cookie); } /* @@ -1658,17 +1736,18 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) */ static int ttwu_remote(struct task_struct *p, int wake_flags) { + struct rq_flags rf; struct rq *rq; int ret = 0; - rq = __task_rq_lock(p); + rq = __task_rq_lock(p, &rf); if (task_on_rq_queued(p)) { /* check_preempt_curr() may use rq clock */ update_rq_clock(rq); - ttwu_do_wakeup(rq, p, wake_flags); + ttwu_do_wakeup(rq, p, wake_flags, rf.cookie); ret = 1; } - __task_rq_unlock(rq); + __task_rq_unlock(rq, &rf); return ret; } @@ -1678,6 +1757,7 @@ void sched_ttwu_pending(void) { struct rq *rq = this_rq(); struct llist_node *llist = llist_del_all(&rq->wake_list); + struct pin_cookie cookie; struct task_struct *p; unsigned long flags; @@ -1685,15 +1765,19 @@ void sched_ttwu_pending(void) return; raw_spin_lock_irqsave(&rq->lock, flags); - lockdep_pin_lock(&rq->lock); + cookie = lockdep_pin_lock(&rq->lock); while (llist) { p = llist_entry(llist, struct task_struct, wake_entry); llist = llist_next(llist); - ttwu_do_activate(rq, p, 0); + /* + * See ttwu_queue(); we only call ttwu_queue_remote() when + * its a x-cpu wakeup. 
+ */ + ttwu_do_activate(rq, p, WF_MIGRATED, cookie); } - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -1777,9 +1861,10 @@ bool cpus_share_cache(int this_cpu, int that_cpu) } #endif /* CONFIG_SMP */ -static void ttwu_queue(struct task_struct *p, int cpu) +static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) { struct rq *rq = cpu_rq(cpu); + struct pin_cookie cookie; #if defined(CONFIG_SMP) if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { @@ -1790,9 +1875,9 @@ static void ttwu_queue(struct task_struct *p, int cpu) #endif raw_spin_lock(&rq->lock); - lockdep_pin_lock(&rq->lock); - ttwu_do_activate(rq, p, 0); - lockdep_unpin_lock(&rq->lock); + cookie = lockdep_pin_lock(&rq->lock); + ttwu_do_activate(rq, p, wake_flags, cookie); + lockdep_unpin_lock(&rq->lock, cookie); raw_spin_unlock(&rq->lock); } @@ -1961,9 +2046,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; - if (p->sched_class->task_waking) - p->sched_class->task_waking(p); - cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; @@ -1971,7 +2053,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) } #endif /* CONFIG_SMP */ - ttwu_queue(p, cpu); + ttwu_queue(p, cpu, wake_flags); stat: if (schedstat_enabled()) ttwu_stat(p, cpu, wake_flags); @@ -1989,7 +2071,7 @@ out: * ensure that this_rq() is locked, @p is bound to this_rq() and not * the current task. */ -static void try_to_wake_up_local(struct task_struct *p) +static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie) { struct rq *rq = task_rq(p); @@ -2006,11 +2088,11 @@ static void try_to_wake_up_local(struct task_struct *p) * disabled avoiding further scheduler activity on it and we've * not yet picked a replacement task. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); raw_spin_unlock(&rq->lock); raw_spin_lock(&p->pi_lock); raw_spin_lock(&rq->lock); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, cookie); } if (!(p->state & TASK_NORMAL)) @@ -2021,7 +2103,7 @@ static void try_to_wake_up_local(struct task_struct *p) if (!task_on_rq_queued(p)) ttwu_activate(rq, p, ENQUEUE_WAKEUP); - ttwu_do_wakeup(rq, p, 0); + ttwu_do_wakeup(rq, p, 0, cookie); if (schedstat_enabled()) ttwu_stat(p, smp_processor_id(), 0); out: @@ -2381,7 +2463,8 @@ static int dl_overflow(struct task_struct *p, int policy, u64 new_bw = dl_policy(policy) ? 
to_ratio(period, runtime) : 0; int cpus, err = -1; - if (new_bw == p->dl.dl_bw) + /* !deadline task may carry old deadline bandwidth */ + if (new_bw == p->dl.dl_bw && task_has_dl_policy(p)) return 0; /* @@ -2420,12 +2503,12 @@ extern void init_dl_bw(struct dl_bw *dl_b); */ void wake_up_new_task(struct task_struct *p) { - unsigned long flags; + struct rq_flags rf; struct rq *rq; - raw_spin_lock_irqsave(&p->pi_lock, flags); /* Initialize new task's runnable average */ init_entity_runnable_average(&p->se); + raw_spin_lock_irqsave(&p->pi_lock, rf.flags); #ifdef CONFIG_SMP /* * Fork balancing, do it here and not earlier because: @@ -2434,8 +2517,10 @@ void wake_up_new_task(struct task_struct *p) */ set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif + /* Post initialize new task's util average when its cfs_rq is set */ + post_init_entity_util_avg(&p->se); - rq = __task_rq_lock(p); + rq = __task_rq_lock(p, &rf); activate_task(rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); @@ -2446,12 +2531,12 @@ void wake_up_new_task(struct task_struct *p) * Nothing relies on rq->lock after this, so its fine to * drop it. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, rf.cookie); p->sched_class->task_woken(rq, p); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, rf.cookie); } #endif - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); } #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -2713,7 +2798,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) */ static __always_inline struct rq * context_switch(struct rq *rq, struct task_struct *prev, - struct task_struct *next) + struct task_struct *next, struct pin_cookie cookie) { struct mm_struct *mm, *oldmm; @@ -2733,7 +2818,7 @@ context_switch(struct rq *rq, struct task_struct *prev, atomic_inc(&oldmm->mm_count); enter_lazy_tlb(oldmm, next); } else - switch_mm(oldmm, mm, next); + switch_mm_irqs_off(oldmm, mm, next); if (!prev->mm) { prev->active_mm = NULL; @@ -2745,7 +2830,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * of the scheduler it's an obvious special-case), so we * do an early lockdep release here: */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); spin_release(&rq->lock.dep_map, 1, _THIS_IP_); /* Here we just switch the register state and the stack. */ @@ -2867,7 +2952,7 @@ EXPORT_PER_CPU_SYMBOL(kernel_cpustat); */ unsigned long long task_sched_runtime(struct task_struct *p) { - unsigned long flags; + struct rq_flags rf; struct rq *rq; u64 ns; @@ -2887,7 +2972,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) return p->se.sum_exec_runtime; #endif - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); /* * Must be ->curr _and_ ->on_rq. 
If dequeued, we would * project cycles that may never be accounted to this @@ -2898,7 +2983,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) p->sched_class->update_curr(rq); } ns = p->se.sum_exec_runtime; - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return ns; } @@ -2918,7 +3003,7 @@ void scheduler_tick(void) raw_spin_lock(&rq->lock); update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); - update_cpu_load_active(rq); + cpu_load_update_active(rq); calc_global_load_tick(rq); raw_spin_unlock(&rq->lock); @@ -2961,6 +3046,20 @@ u64 scheduler_tick_max_deferment(void) #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_PREEMPT_TRACER)) +/* + * If the value passed in is equal to the current preempt count + * then we just disabled preemption. Start timing the latency. + */ +static inline void preempt_latency_start(int val) +{ + if (preempt_count() == val) { + unsigned long ip = get_lock_parent_ip(); +#ifdef CONFIG_DEBUG_PREEMPT + current->preempt_disable_ip = ip; +#endif + trace_preempt_off(CALLER_ADDR0, ip); + } +} void preempt_count_add(int val) { @@ -2979,17 +3078,21 @@ void preempt_count_add(int val) DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK - 10); #endif - if (preempt_count() == val) { - unsigned long ip = get_lock_parent_ip(); -#ifdef CONFIG_DEBUG_PREEMPT - current->preempt_disable_ip = ip; -#endif - trace_preempt_off(CALLER_ADDR0, ip); - } + preempt_latency_start(val); } EXPORT_SYMBOL(preempt_count_add); NOKPROBE_SYMBOL(preempt_count_add); +/* + * If the value passed in equals to the current preempt count + * then we just enabled preemption. Stop timing the latency. + */ +static inline void preempt_latency_stop(int val) +{ + if (preempt_count() == val) + trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); +} + void preempt_count_sub(int val) { #ifdef CONFIG_DEBUG_PREEMPT @@ -3006,13 +3109,15 @@ void preempt_count_sub(int val) return; #endif - if (preempt_count() == val) - trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); + preempt_latency_stop(val); __preempt_count_sub(val); } EXPORT_SYMBOL(preempt_count_sub); NOKPROBE_SYMBOL(preempt_count_sub); +#else +static inline void preempt_latency_start(int val) { } +static inline void preempt_latency_stop(int val) { } #endif /* @@ -3065,7 +3170,7 @@ static inline void schedule_debug(struct task_struct *prev) * Pick up the highest-prio task: */ static inline struct task_struct * -pick_next_task(struct rq *rq, struct task_struct *prev) +pick_next_task(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) { const struct sched_class *class = &fair_sched_class; struct task_struct *p; @@ -3076,20 +3181,20 @@ pick_next_task(struct rq *rq, struct task_struct *prev) */ if (likely(prev->sched_class == class && rq->nr_running == rq->cfs.h_nr_running)) { - p = fair_sched_class.pick_next_task(rq, prev); + p = fair_sched_class.pick_next_task(rq, prev, cookie); if (unlikely(p == RETRY_TASK)) goto again; /* assumes fair_sched_class->next == idle_sched_class */ if (unlikely(!p)) - p = idle_sched_class.pick_next_task(rq, prev); + p = idle_sched_class.pick_next_task(rq, prev, cookie); return p; } again: for_each_class(class) { - p = class->pick_next_task(rq, prev); + p = class->pick_next_task(rq, prev, cookie); if (p) { if (unlikely(p == RETRY_TASK)) goto again; @@ -3143,6 +3248,7 @@ static void __sched notrace __schedule(bool preempt) { struct task_struct *prev, *next; unsigned long *switch_count; + struct pin_cookie cookie; struct rq *rq; int cpu; 
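
[Editor's aside — not part of the patch.] The preempt_latency_start()/preempt_latency_stop() helpers added above act only when the value passed in matches the current preempt count, i.e. on the outermost disable/enable of a possibly nested region, so only the full disabled interval is timed. A rough user-space sketch of that outermost-level timing idea follows; the names and the plain depth counter are hypothetical stand-ins for the kernel's preempt count and tracepoints.

#include <stdio.h>
#include <time.h>

static int disable_depth;
static struct timespec latency_start;

static void demo_disable(void)
{
	/* Only the outermost disable starts the latency measurement. */
	if (disable_depth++ == 0)
		clock_gettime(CLOCK_MONOTONIC, &latency_start);
}

static void demo_enable(void)
{
	/* Only the matching outermost enable ends it and reports. */
	if (--disable_depth == 0) {
		struct timespec now;
		long long ns;

		clock_gettime(CLOCK_MONOTONIC, &now);
		ns = (long long)(now.tv_sec - latency_start.tv_sec) * 1000000000LL
		     + (now.tv_nsec - latency_start.tv_nsec);
		printf("disabled for %lld ns\n", ns);
	}
}

int main(void)
{
	demo_disable();
	demo_disable();		/* nested: no new measurement started */
	demo_enable();		/* nested: nothing reported yet */
	demo_enable();		/* outermost: latency reported here */
	return 0;
}
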
@@ -3176,7 +3282,7 @@ static void __sched notrace __schedule(bool preempt) */ smp_mb__before_spinlock(); raw_spin_lock(&rq->lock); - lockdep_pin_lock(&rq->lock); + cookie = lockdep_pin_lock(&rq->lock); rq->clock_skip_update <<= 1; /* promote REQ to ACT */ @@ -3198,7 +3304,7 @@ static void __sched notrace __schedule(bool preempt) to_wakeup = wq_worker_sleeping(prev); if (to_wakeup) - try_to_wake_up_local(to_wakeup); + try_to_wake_up_local(to_wakeup, cookie); } } switch_count = &prev->nvcsw; @@ -3207,7 +3313,7 @@ static void __sched notrace __schedule(bool preempt) if (task_on_rq_queued(prev)) update_rq_clock(rq); - next = pick_next_task(rq, prev); + next = pick_next_task(rq, prev, cookie); clear_tsk_need_resched(prev); clear_preempt_need_resched(); rq->clock_skip_update = 0; @@ -3218,9 +3324,9 @@ static void __sched notrace __schedule(bool preempt) ++*switch_count; trace_sched_switch(preempt, prev, next); - rq = context_switch(rq, prev, next); /* unlocks the rq */ + rq = context_switch(rq, prev, next, cookie); /* unlocks the rq */ } else { - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); raw_spin_unlock_irq(&rq->lock); } @@ -3287,8 +3393,23 @@ void __sched schedule_preempt_disabled(void) static void __sched notrace preempt_schedule_common(void) { do { + /* + * Because the function tracer can trace preempt_count_sub() + * and it also uses preempt_enable/disable_notrace(), if + * NEED_RESCHED is set, the preempt_enable_notrace() called + * by the function tracer will call this function again and + * cause infinite recursion. + * + * Preemption must be disabled here before the function + * tracer can trace. Break up preempt_disable() into two + * calls. One to disable preemption without fear of being + * traced. The other to still record the preemption latency, + * which can also be traced by the function tracer. + */ preempt_disable_notrace(); + preempt_latency_start(1); __schedule(true); + preempt_latency_stop(1); preempt_enable_no_resched_notrace(); /* @@ -3340,7 +3461,21 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) return; do { + /* + * Because the function tracer can trace preempt_count_sub() + * and it also uses preempt_enable/disable_notrace(), if + * NEED_RESCHED is set, the preempt_enable_notrace() called + * by the function tracer will call this function again and + * cause infinite recursion. + * + * Preemption must be disabled here before the function + * tracer can trace. Break up preempt_disable() into two + * calls. One to disable preemption without fear of being + * traced. The other to still record the preemption latency, + * which can also be traced by the function tracer. + */ preempt_disable_notrace(); + preempt_latency_start(1); /* * Needs preempt disabled in case user_exit() is traced * and the tracer calls preempt_enable_notrace() causing @@ -3350,6 +3485,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) __schedule(true); exception_exit(prev_ctx); + preempt_latency_stop(1); preempt_enable_no_resched_notrace(); } while (need_resched()); } @@ -3406,12 +3542,13 @@ EXPORT_SYMBOL(default_wake_function); void rt_mutex_setprio(struct task_struct *p, int prio) { int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE; - struct rq *rq; const struct sched_class *prev_class; + struct rq_flags rf; + struct rq *rq; BUG_ON(prio > MAX_PRIO); - rq = __task_rq_lock(p); + rq = __task_rq_lock(p, &rf); /* * Idle task boosting is a nono in general. 
There is one @@ -3487,7 +3624,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) check_class_changed(rq, p, prev_class, oldprio); out_unlock: preempt_disable(); /* avoid rq from going away on us */ - __task_rq_unlock(rq); + __task_rq_unlock(rq, &rf); balance_callback(rq); preempt_enable(); @@ -3497,7 +3634,7 @@ out_unlock: void set_user_nice(struct task_struct *p, long nice) { int old_prio, delta, queued; - unsigned long flags; + struct rq_flags rf; struct rq *rq; if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) @@ -3506,7 +3643,7 @@ void set_user_nice(struct task_struct *p, long nice) * We have to be careful, if called from sys_setpriority(), * the task might be in the middle of scheduling on another CPU. */ - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected @@ -3537,7 +3674,7 @@ void set_user_nice(struct task_struct *p, long nice) resched_curr(rq); } out_unlock: - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); } EXPORT_SYMBOL(set_user_nice); @@ -3834,11 +3971,11 @@ static int __sched_setscheduler(struct task_struct *p, MAX_RT_PRIO - 1 - attr->sched_priority; int retval, oldprio, oldpolicy = -1, queued, running; int new_effective_prio, policy = attr->sched_policy; - unsigned long flags; const struct sched_class *prev_class; - struct rq *rq; + struct rq_flags rf; int reset_on_fork; int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; + struct rq *rq; /* may grab non-irq protected spin_locks */ BUG_ON(in_interrupt()); @@ -3933,13 +4070,13 @@ recheck: * To be able to change p->policy safely, the appropriate * runqueue lock must be held. */ - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); /* * Changing the policy of the stop threads its a very bad idea */ if (p == rq->stop) { - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return -EINVAL; } @@ -3956,7 +4093,7 @@ recheck: goto change; p->sched_reset_on_fork = reset_on_fork; - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return 0; } change: @@ -3970,7 +4107,7 @@ change: if (rt_bandwidth_enabled() && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0 && !task_group_is_autogroup(task_group(p))) { - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return -EPERM; } #endif @@ -3985,7 +4122,7 @@ change: */ if (!cpumask_subset(span, &p->cpus_allowed) || rq->rd->dl_bw.bw == 0) { - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return -EPERM; } } @@ -3995,7 +4132,7 @@ change: /* recheck policy now with rq lock held */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); goto recheck; } @@ -4005,7 +4142,7 @@ change: * is available. 
*/ if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) { - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return -EBUSY; } @@ -4050,7 +4187,7 @@ change: check_class_changed(rq, p, prev_class, oldprio); preempt_disable(); /* avoid rq from going away on us */ - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); if (pi) rt_mutex_adjust_pi(p); @@ -4903,10 +5040,10 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, { struct task_struct *p; unsigned int time_slice; - unsigned long flags; + struct rq_flags rf; + struct timespec t; struct rq *rq; int retval; - struct timespec t; if (pid < 0) return -EINVAL; @@ -4921,11 +5058,11 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, if (retval) goto out_unlock; - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); time_slice = 0; if (p->sched_class->get_rr_interval) time_slice = p->sched_class->get_rr_interval(rq, p); - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); rcu_read_unlock(); jiffies_to_timespec(time_slice, &t); @@ -5001,7 +5138,8 @@ void show_state_filter(unsigned long state_filter) touch_all_softlockup_watchdogs(); #ifdef CONFIG_SCHED_DEBUG - sysrq_sched_debug_show(); + if (!state_filter) + sysrq_sched_debug_show(); #endif rcu_read_unlock(); /* @@ -5163,6 +5301,8 @@ out: #ifdef CONFIG_SMP +static bool sched_smp_initialized __read_mostly; + #ifdef CONFIG_NUMA_BALANCING /* Migrate current task p to target_cpu */ int migrate_task_to(struct task_struct *p, int target_cpu) @@ -5188,11 +5328,11 @@ int migrate_task_to(struct task_struct *p, int target_cpu) */ void sched_setnuma(struct task_struct *p, int nid) { - struct rq *rq; - unsigned long flags; bool queued, running; + struct rq_flags rf; + struct rq *rq; - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); queued = task_on_rq_queued(p); running = task_current(rq, p); @@ -5207,7 +5347,7 @@ void sched_setnuma(struct task_struct *p, int nid) p->sched_class->set_curr_task(rq); if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE); - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); } #endif /* CONFIG_NUMA_BALANCING */ @@ -5223,7 +5363,7 @@ void idle_task_exit(void) BUG_ON(cpu_online(smp_processor_id())); if (mm != &init_mm) { - switch_mm(mm, &init_mm, current); + switch_mm_irqs_off(mm, &init_mm, current); finish_arch_post_lock_switch(); } mmdrop(mm); @@ -5271,6 +5411,7 @@ static void migrate_tasks(struct rq *dead_rq) { struct rq *rq = dead_rq; struct task_struct *next, *stop = rq->stop; + struct pin_cookie cookie; int dest_cpu; /* @@ -5302,8 +5443,8 @@ static void migrate_tasks(struct rq *dead_rq) /* * pick_next_task assumes pinned rq->lock. */ - lockdep_pin_lock(&rq->lock); - next = pick_next_task(rq, &fake_task); + cookie = lockdep_pin_lock(&rq->lock); + next = pick_next_task(rq, &fake_task, cookie); BUG_ON(!next); next->sched_class->put_prev_task(rq, next); @@ -5316,7 +5457,7 @@ static void migrate_tasks(struct rq *dead_rq) * because !cpu_active at this point, which means load-balance * will not interfere. Also, stop-machine. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); raw_spin_unlock(&rq->lock); raw_spin_lock(&next->pi_lock); raw_spin_lock(&rq->lock); @@ -5377,127 +5518,13 @@ static void set_rq_offline(struct rq *rq) } } -/* - * migration_call - callback that gets triggered when a CPU is added. - * Here we can start up the necessary migration thread for the new CPU. 
- */ -static int -migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) +static void set_cpu_rq_start_time(unsigned int cpu) { - int cpu = (long)hcpu; - unsigned long flags; struct rq *rq = cpu_rq(cpu); - switch (action & ~CPU_TASKS_FROZEN) { - - case CPU_UP_PREPARE: - rq->calc_load_update = calc_load_update; - account_reset_rq(rq); - break; - - case CPU_ONLINE: - /* Update our root-domain */ - raw_spin_lock_irqsave(&rq->lock, flags); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - - set_rq_online(rq); - } - raw_spin_unlock_irqrestore(&rq->lock, flags); - break; - -#ifdef CONFIG_HOTPLUG_CPU - case CPU_DYING: - sched_ttwu_pending(); - /* Update our root-domain */ - raw_spin_lock_irqsave(&rq->lock, flags); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - set_rq_offline(rq); - } - migrate_tasks(rq); - BUG_ON(rq->nr_running != 1); /* the migration thread */ - raw_spin_unlock_irqrestore(&rq->lock, flags); - break; - - case CPU_DEAD: - calc_load_migrate(rq); - break; -#endif - } - - update_max_interval(); - - return NOTIFY_OK; -} - -/* - * Register at high priority so that task migration (migrate_all_tasks) - * happens before everything else. This has to be lower priority than - * the notifier in the perf_event subsystem, though. - */ -static struct notifier_block migration_notifier = { - .notifier_call = migration_call, - .priority = CPU_PRI_MIGRATION, -}; - -static void set_cpu_rq_start_time(void) -{ - int cpu = smp_processor_id(); - struct rq *rq = cpu_rq(cpu); rq->age_stamp = sched_clock_cpu(cpu); } -static int sched_cpu_active(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_STARTING: - set_cpu_rq_start_time(); - return NOTIFY_OK; - - case CPU_DOWN_FAILED: - set_cpu_active(cpu, true); - return NOTIFY_OK; - - default: - return NOTIFY_DONE; - } -} - -static int sched_cpu_inactive(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_PREPARE: - set_cpu_active((long)hcpu, false); - return NOTIFY_OK; - default: - return NOTIFY_DONE; - } -} - -static int __init migration_init(void) -{ - void *cpu = (void *)(long)smp_processor_id(); - int err; - - /* Initialize migration for the boot CPU */ - err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); - BUG_ON(err == NOTIFY_BAD); - migration_call(&migration_notifier, CPU_ONLINE, cpu); - register_cpu_notifier(&migration_notifier); - - /* Register cpu active notifiers */ - cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); - cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); - - return 0; -} -early_initcall(migration_init); - static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ #ifdef CONFIG_SCHED_DEBUG @@ -6645,10 +6672,10 @@ static void sched_init_numa(void) init_numa_topology_type(); } -static void sched_domains_numa_masks_set(int cpu) +static void sched_domains_numa_masks_set(unsigned int cpu) { - int i, j; int node = cpu_to_node(cpu); + int i, j; for (i = 0; i < sched_domains_numa_levels; i++) { for (j = 0; j < nr_node_ids; j++) { @@ -6658,51 +6685,20 @@ static void sched_domains_numa_masks_set(int cpu) } } -static void sched_domains_numa_masks_clear(int cpu) +static void sched_domains_numa_masks_clear(unsigned int cpu) { int i, j; + for (i = 0; i < sched_domains_numa_levels; i++) { for (j = 0; j < nr_node_ids; j++) cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); } } -/* - * Update 
sched_domains_numa_masks[level][node] array when new cpus - * are onlined. - */ -static int sched_domains_numa_masks_update(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - sched_domains_numa_masks_set(cpu); - break; - - case CPU_DEAD: - sched_domains_numa_masks_clear(cpu); - break; - - default: - return NOTIFY_DONE; - } - - return NOTIFY_OK; -} #else -static inline void sched_init_numa(void) -{ -} - -static int sched_domains_numa_masks_update(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - return 0; -} +static inline void sched_init_numa(void) { } +static void sched_domains_numa_masks_set(unsigned int cpu) { } +static void sched_domains_numa_masks_clear(unsigned int cpu) { } #endif /* CONFIG_NUMA */ static int __sdt_alloc(const struct cpumask *cpu_map) @@ -7092,13 +7088,9 @@ static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */ * If we come here as part of a suspend/resume, don't touch cpusets because we * want to restore it back to its original state upon resume anyway. */ -static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, - void *hcpu) +static void cpuset_cpu_active(void) { - switch (action) { - case CPU_ONLINE_FROZEN: - case CPU_DOWN_FAILED_FROZEN: - + if (cpuhp_tasks_frozen) { /* * num_cpus_frozen tracks how many CPUs are involved in suspend * resume sequence. As long as this is not the last online @@ -7108,35 +7100,25 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, num_cpus_frozen--; if (likely(num_cpus_frozen)) { partition_sched_domains(1, NULL, NULL); - break; + return; } - /* * This is the last CPU online operation. So fall through and * restore the original sched domains by considering the * cpuset configurations. */ - - case CPU_ONLINE: - cpuset_update_active_cpus(true); - break; - default: - return NOTIFY_DONE; } - return NOTIFY_OK; + cpuset_update_active_cpus(true); } -static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, - void *hcpu) +static int cpuset_cpu_inactive(unsigned int cpu) { unsigned long flags; - long cpu = (long)hcpu; struct dl_bw *dl_b; bool overflow; int cpus; - switch (action) { - case CPU_DOWN_PREPARE: + if (!cpuhp_tasks_frozen) { rcu_read_lock_sched(); dl_b = dl_bw_of(cpu); @@ -7148,19 +7130,120 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, rcu_read_unlock_sched(); if (overflow) - return notifier_from_errno(-EBUSY); + return -EBUSY; cpuset_update_active_cpus(false); - break; - case CPU_DOWN_PREPARE_FROZEN: + } else { num_cpus_frozen++; partition_sched_domains(1, NULL, NULL); - break; - default: - return NOTIFY_DONE; } - return NOTIFY_OK; + return 0; } +int sched_cpu_activate(unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + set_cpu_active(cpu, true); + + if (sched_smp_initialized) { + sched_domains_numa_masks_set(cpu); + cpuset_cpu_active(); + } + + /* + * Put the rq online, if not already. This happens: + * + * 1) In the early boot process, because we build the real domains + * after all cpus have been brought up. + * + * 2) At runtime, if cpuset_cpu_active() fails to rebuild the + * domains. 
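These hunks replace the scheduler's CPU-hotplug notifiers with direct per-state callbacks (sched_cpu_activate(), sched_cpu_deactivate(), sched_cpu_starting(), sched_cpu_dying()). A rough sketch of the two callback shapes, with hypothetical names: errors are now plain negative returns instead of notifier_from_errno(), and the suspend/resume case is detected through cpuhp_tasks_frozen rather than the CPU_*_FROZEN action bits.

/*
 * New style (sketch): one callback per hotplug state, invoked directly
 * by the hotplug core; a negative return aborts the transition.
 */
static int example_cpu_online(unsigned int cpu)
{
        if (cpuhp_tasks_frozen)
                return 0;               /* suspend/resume: nothing to do */
        /* ... bring @cpu's scheduler state online ... */
        return 0;
}

/*
 * Old style (what the removed notifiers above looked like): all events
 * multiplexed through one callback, errors reported via NOTIFY_BAD.
 */
static int example_cpu_notify(struct notifier_block *nfb,
                              unsigned long action, void *hcpu)
{
        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_ONLINE:
                return example_cpu_online((long)hcpu) ? NOTIFY_BAD : NOTIFY_OK;
        default:
                return NOTIFY_DONE;
        }
}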
+ */ + raw_spin_lock_irqsave(&rq->lock, flags); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_online(rq); + } + raw_spin_unlock_irqrestore(&rq->lock, flags); + + update_max_interval(); + + return 0; +} + +int sched_cpu_deactivate(unsigned int cpu) +{ + int ret; + + set_cpu_active(cpu, false); + /* + * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU + * users of this state to go away such that all new such users will + * observe it. + * + * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might + * not imply sync_sched(), so wait for both. + * + * Do sync before park smpboot threads to take care the rcu boost case. + */ + if (IS_ENABLED(CONFIG_PREEMPT)) + synchronize_rcu_mult(call_rcu, call_rcu_sched); + else + synchronize_rcu(); + + if (!sched_smp_initialized) + return 0; + + ret = cpuset_cpu_inactive(cpu); + if (ret) { + set_cpu_active(cpu, true); + return ret; + } + sched_domains_numa_masks_clear(cpu); + return 0; +} + +static void sched_rq_cpu_starting(unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + rq->calc_load_update = calc_load_update; + account_reset_rq(rq); + update_max_interval(); +} + +int sched_cpu_starting(unsigned int cpu) +{ + set_cpu_rq_start_time(cpu); + sched_rq_cpu_starting(cpu); + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU +int sched_cpu_dying(unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + /* Handle pending wakeups and then migrate everything off */ + sched_ttwu_pending(); + raw_spin_lock_irqsave(&rq->lock, flags); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_offline(rq); + } + migrate_tasks(rq); + BUG_ON(rq->nr_running != 1); + raw_spin_unlock_irqrestore(&rq->lock, flags); + calc_load_migrate(rq); + update_max_interval(); + nohz_balance_exit_idle(cpu); + hrtick_clear(rq); + return 0; +} +#endif + void __init sched_init_smp(void) { cpumask_var_t non_isolated_cpus; @@ -7182,12 +7265,6 @@ void __init sched_init_smp(void) cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); mutex_unlock(&sched_domains_mutex); - hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); - hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); - hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); - - init_hrtick(); - /* Move init over to a non-isolated CPU */ if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) BUG(); @@ -7196,7 +7273,16 @@ void __init sched_init_smp(void) init_sched_rt_class(); init_sched_dl_class(); + sched_smp_initialized = true; +} + +static int __init migration_init(void) +{ + sched_rq_cpu_starting(smp_processor_id()); + return 0; } +early_initcall(migration_init); + #else void __init sched_init_smp(void) { @@ -7331,8 +7417,6 @@ void __init sched_init(void) for (j = 0; j < CPU_LOAD_IDX_MAX; j++) rq->cpu_load[j] = 0; - rq->last_load_update_tick = jiffies; - #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; @@ -7351,12 +7435,13 @@ void __init sched_init(void) rq_attach_root(rq, &def_root_domain); #ifdef CONFIG_NO_HZ_COMMON + rq->last_load_update_tick = jiffies; rq->nohz_flags = 0; #endif #ifdef CONFIG_NO_HZ_FULL rq->last_sched_tick = 0; #endif -#endif +#endif /* CONFIG_SMP */ init_rq_hrtick(rq); atomic_set(&rq->nr_iowait, 0); } @@ -7394,7 +7479,7 @@ void __init sched_init(void) if (cpu_isolated_map == NULL) zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); idle_thread_set_boot_cpu(); - set_cpu_rq_start_time(); + set_cpu_rq_start_time(smp_processor_id()); #endif init_sched_fair_class(); @@ -7639,10 +7724,10 @@ 
void sched_move_task(struct task_struct *tsk) { struct task_group *tg; int queued, running; - unsigned long flags; + struct rq_flags rf; struct rq *rq; - rq = task_rq_lock(tsk, &flags); + rq = task_rq_lock(tsk, &rf); running = task_current(rq, tsk); queued = task_on_rq_queued(tsk); @@ -7674,7 +7759,7 @@ void sched_move_task(struct task_struct *tsk) if (queued) enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); - task_rq_unlock(rq, tsk, &flags); + task_rq_unlock(rq, tsk, &rf); } #endif /* CONFIG_CGROUP_SCHED */ @@ -7894,7 +7979,7 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) static int sched_rt_global_constraints(void) { unsigned long flags; - int i, ret = 0; + int i; raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); for_each_possible_cpu(i) { @@ -7906,7 +7991,7 @@ static int sched_rt_global_constraints(void) } raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); - return ret; + return 0; } #endif /* CONFIG_RT_GROUP_SCHED */ diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 4a811203c04a..41f85c4d0938 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -25,11 +25,22 @@ enum cpuacct_stat_index { CPUACCT_STAT_NSTATS, }; +enum cpuacct_usage_index { + CPUACCT_USAGE_USER, /* ... user mode */ + CPUACCT_USAGE_SYSTEM, /* ... kernel mode */ + + CPUACCT_USAGE_NRUSAGE, +}; + +struct cpuacct_usage { + u64 usages[CPUACCT_USAGE_NRUSAGE]; +}; + /* track cpu usage of a group of tasks and its child groups */ struct cpuacct { struct cgroup_subsys_state css; /* cpuusage holds pointer to a u64-type object on every cpu */ - u64 __percpu *cpuusage; + struct cpuacct_usage __percpu *cpuusage; struct kernel_cpustat __percpu *cpustat; }; @@ -49,7 +60,7 @@ static inline struct cpuacct *parent_ca(struct cpuacct *ca) return css_ca(ca->css.parent); } -static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); +static DEFINE_PER_CPU(struct cpuacct_usage, root_cpuacct_cpuusage); static struct cpuacct root_cpuacct = { .cpustat = &kernel_cpustat, .cpuusage = &root_cpuacct_cpuusage, @@ -68,7 +79,7 @@ cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) if (!ca) goto out; - ca->cpuusage = alloc_percpu(u64); + ca->cpuusage = alloc_percpu(struct cpuacct_usage); if (!ca->cpuusage) goto out_free_ca; @@ -96,20 +107,37 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css) kfree(ca); } -static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) +static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, + enum cpuacct_usage_index index) { - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); u64 data; + /* + * We allow index == CPUACCT_USAGE_NRUSAGE here to read + * the sum of suages. + */ + BUG_ON(index > CPUACCT_USAGE_NRUSAGE); + #ifndef CONFIG_64BIT /* * Take rq->lock to make 64-bit read safe on 32-bit platforms. 
*/ raw_spin_lock_irq(&cpu_rq(cpu)->lock); - data = *cpuusage; +#endif + + if (index == CPUACCT_USAGE_NRUSAGE) { + int i = 0; + + data = 0; + for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) + data += cpuusage->usages[i]; + } else { + data = cpuusage->usages[index]; + } + +#ifndef CONFIG_64BIT raw_spin_unlock_irq(&cpu_rq(cpu)->lock); -#else - data = *cpuusage; #endif return data; @@ -117,69 +145,103 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) { - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + int i; #ifndef CONFIG_64BIT /* * Take rq->lock to make 64-bit write safe on 32-bit platforms. */ raw_spin_lock_irq(&cpu_rq(cpu)->lock); - *cpuusage = val; +#endif + + for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) + cpuusage->usages[i] = val; + +#ifndef CONFIG_64BIT raw_spin_unlock_irq(&cpu_rq(cpu)->lock); -#else - *cpuusage = val; #endif } /* return total cpu usage (in nanoseconds) of a group */ -static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) +static u64 __cpuusage_read(struct cgroup_subsys_state *css, + enum cpuacct_usage_index index) { struct cpuacct *ca = css_ca(css); u64 totalcpuusage = 0; int i; - for_each_present_cpu(i) - totalcpuusage += cpuacct_cpuusage_read(ca, i); + for_each_possible_cpu(i) + totalcpuusage += cpuacct_cpuusage_read(ca, i, index); return totalcpuusage; } +static u64 cpuusage_user_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return __cpuusage_read(css, CPUACCT_USAGE_USER); +} + +static u64 cpuusage_sys_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return __cpuusage_read(css, CPUACCT_USAGE_SYSTEM); +} + +static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + return __cpuusage_read(css, CPUACCT_USAGE_NRUSAGE); +} + static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { struct cpuacct *ca = css_ca(css); - int err = 0; - int i; + int cpu; /* * Only allow '0' here to do a reset. 
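The cpuacct changes split the per-CPU counter into per-mode buckets: the pre-existing "usage" file keeps reporting the total by summing all buckets (the CPUACCT_USAGE_NRUSAGE index), while the new usage_user/usage_sys and usage_percpu_user/usage_percpu_sys files read one bucket each. A sketch of the implied relationship (hypothetical helper, not patch code):

/*
 * Sketch: the legacy "usage" value equals the sum of the per-mode
 * buckets that usage_user and usage_sys now expose individually.
 */
static u64 example_cpuacct_total(struct cpuacct_usage *cu)
{
        return cu->usages[CPUACCT_USAGE_USER] +
               cu->usages[CPUACCT_USAGE_SYSTEM];
}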
*/ - if (val) { - err = -EINVAL; - goto out; - } + if (val) + return -EINVAL; - for_each_present_cpu(i) - cpuacct_cpuusage_write(ca, i, 0); + for_each_possible_cpu(cpu) + cpuacct_cpuusage_write(ca, cpu, 0); -out: - return err; + return 0; } -static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) +static int __cpuacct_percpu_seq_show(struct seq_file *m, + enum cpuacct_usage_index index) { struct cpuacct *ca = css_ca(seq_css(m)); u64 percpu; int i; - for_each_present_cpu(i) { - percpu = cpuacct_cpuusage_read(ca, i); + for_each_possible_cpu(i) { + percpu = cpuacct_cpuusage_read(ca, i, index); seq_printf(m, "%llu ", (unsigned long long) percpu); } seq_printf(m, "\n"); return 0; } +static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V) +{ + return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_USER); +} + +static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V) +{ + return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_SYSTEM); +} + +static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) +{ + return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_NRUSAGE); +} + static const char * const cpuacct_stat_desc[] = { [CPUACCT_STAT_USER] = "user", [CPUACCT_STAT_SYSTEM] = "system", @@ -191,7 +253,7 @@ static int cpuacct_stats_show(struct seq_file *sf, void *v) int cpu; s64 val = 0; - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); val += kcpustat->cpustat[CPUTIME_USER]; val += kcpustat->cpustat[CPUTIME_NICE]; @@ -200,7 +262,7 @@ static int cpuacct_stats_show(struct seq_file *sf, void *v) seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val); val = 0; - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); val += kcpustat->cpustat[CPUTIME_SYSTEM]; val += kcpustat->cpustat[CPUTIME_IRQ]; @@ -220,10 +282,26 @@ static struct cftype files[] = { .write_u64 = cpuusage_write, }, { + .name = "usage_user", + .read_u64 = cpuusage_user_read, + }, + { + .name = "usage_sys", + .read_u64 = cpuusage_sys_read, + }, + { .name = "usage_percpu", .seq_show = cpuacct_percpu_seq_show, }, { + .name = "usage_percpu_user", + .seq_show = cpuacct_percpu_user_seq_show, + }, + { + .name = "usage_percpu_sys", + .seq_show = cpuacct_percpu_sys_seq_show, + }, + { .name = "stat", .seq_show = cpuacct_stats_show, }, @@ -238,10 +316,17 @@ static struct cftype files[] = { void cpuacct_charge(struct task_struct *tsk, u64 cputime) { struct cpuacct *ca; + int index = CPUACCT_USAGE_SYSTEM; + struct pt_regs *regs = task_pt_regs(tsk); + + if (regs && user_mode(regs)) + index = CPUACCT_USAGE_USER; rcu_read_lock(); + for (ca = task_ca(tsk); ca; ca = parent_ca(ca)) - *this_cpu_ptr(ca->cpuusage) += cputime; + this_cpu_ptr(ca->cpuusage)->usages[index] += cputime; + rcu_read_unlock(); } diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 5a75b08cfd85..5be58820465c 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -103,10 +103,10 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, const struct sched_dl_entity *dl_se = &p->dl; if (later_mask && - cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) { + cpumask_and(later_mask, cp->free_cpus, tsk_cpus_allowed(p))) { best_cpu = cpumask_any(later_mask); goto out; - } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && + } else if (cpumask_test_cpu(cpudl_maximum(cp), tsk_cpus_allowed(p)) && dl_time_before(dl_se->deadline, cp->elements[0].dl)) { 
best_cpu = cpudl_maximum(cp); if (later_mask) diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 981fcd7dc394..11e9705bf937 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -103,11 +103,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, if (skip) continue; - if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) + if (cpumask_any_and(tsk_cpus_allowed(p), vec->mask) >= nr_cpu_ids) continue; if (lowest_mask) { - cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); + cpumask_and(lowest_mask, tsk_cpus_allowed(p), vec->mask); /* * We have to ensure that we have at least one bit diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 686ec8adf952..fcb7f0217ff4 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -134,7 +134,7 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { struct task_struct *p = dl_task_of(dl_se); - if (p->nr_cpus_allowed > 1) + if (tsk_nr_cpus_allowed(p) > 1) dl_rq->dl_nr_migratory++; update_dl_migration(dl_rq); @@ -144,7 +144,7 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { struct task_struct *p = dl_task_of(dl_se); - if (p->nr_cpus_allowed > 1) + if (tsk_nr_cpus_allowed(p) > 1) dl_rq->dl_nr_migratory--; update_dl_migration(dl_rq); @@ -591,10 +591,10 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) struct sched_dl_entity, dl_timer); struct task_struct *p = dl_task_of(dl_se); - unsigned long flags; + struct rq_flags rf; struct rq *rq; - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); /* * The task might have changed its scheduling policy to something @@ -670,14 +670,14 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * Nothing relies on rq->lock after this, so its safe to drop * rq->lock. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, rf.cookie); push_dl_task(rq); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, rf.cookie); } #endif unlock: - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); /* * This can free the task_struct, including this hrtimer, do not touch @@ -717,10 +717,6 @@ static void update_curr_dl(struct rq *rq) if (!dl_task(curr) || !on_dl_rq(dl_se)) return; - /* Kick cpufreq (see the comment in linux/cpufreq.h). */ - if (cpu_of(rq) == smp_processor_id()) - cpufreq_trigger_update(rq_clock(rq)); - /* * Consumed budget is computed considering the time as * observed by schedulable tasks (excluding time spent @@ -736,6 +732,10 @@ static void update_curr_dl(struct rq *rq) return; } + /* kick cpufreq (see the comment in linux/cpufreq.h). */ + if (cpu_of(rq) == smp_processor_id()) + cpufreq_trigger_update(rq_clock(rq)); + schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); @@ -966,7 +966,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) enqueue_dl_entity(&p->dl, pi_se, flags); - if (!task_current(rq, p) && p->nr_cpus_allowed > 1) + if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1) enqueue_pushable_dl_task(rq, p); } @@ -1040,9 +1040,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) * try to make it stay here, it might be important. 
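The cpudeadline.c, cpupri.c and deadline.c hunks here switch direct field accesses over to the tsk_cpus_allowed()/tsk_nr_cpus_allowed() helpers. Presumably these are trivial accessors along the following lines (sketch; the real definitions live in include/linux/sched.h), giving later work a single place to hook if the affinity mask ever stops living directly in task_struct:

/* Presumed definitions of the helpers used throughout these hunks: */
#define tsk_cpus_allowed(tsk)           (&(tsk)->cpus_allowed)
#define tsk_nr_cpus_allowed(tsk)        ((tsk)->nr_cpus_allowed)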
*/ if (unlikely(dl_task(curr)) && - (curr->nr_cpus_allowed < 2 || + (tsk_nr_cpus_allowed(curr) < 2 || !dl_entity_preempt(&p->dl, &curr->dl)) && - (p->nr_cpus_allowed > 1)) { + (tsk_nr_cpus_allowed(p) > 1)) { int target = find_later_rq(p); if (target != -1 && @@ -1063,7 +1063,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) * Current can't be migrated, useless to reschedule, * let's hope p can move out. */ - if (rq->curr->nr_cpus_allowed == 1 || + if (tsk_nr_cpus_allowed(rq->curr) == 1 || cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1) return; @@ -1071,7 +1071,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) * p is migratable, so let's not schedule it and * see if it is pushed or pulled somewhere else. */ - if (p->nr_cpus_allowed != 1 && + if (tsk_nr_cpus_allowed(p) != 1 && cpudl_find(&rq->rd->cpudl, p, NULL) != -1) return; @@ -1125,7 +1125,8 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, return rb_entry(left, struct sched_dl_entity, rb_node); } -struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) +struct task_struct * +pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) { struct sched_dl_entity *dl_se; struct task_struct *p; @@ -1140,9 +1141,9 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) * disabled avoiding further scheduler activity on it and we're * being very careful to re-start the picking loop. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); pull_dl_task(rq); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, cookie); /* * pull_rt_task() can drop (and re-acquire) rq->lock; this * means a stop task can slip in, in which case we need to @@ -1185,7 +1186,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p) { update_curr_dl(rq); - if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) + if (on_dl_rq(&p->dl) && tsk_nr_cpus_allowed(p) > 1) enqueue_pushable_dl_task(rq, p); } @@ -1286,7 +1287,7 @@ static int find_later_rq(struct task_struct *task) if (unlikely(!later_mask)) return -1; - if (task->nr_cpus_allowed == 1) + if (tsk_nr_cpus_allowed(task) == 1) return -1; /* @@ -1392,7 +1393,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) if (double_lock_balance(rq, later_rq)) { if (unlikely(task_rq(task) != rq || !cpumask_test_cpu(later_rq->cpu, - &task->cpus_allowed) || + tsk_cpus_allowed(task)) || task_running(rq, task) || !dl_task(task) || !task_on_rq_queued(task))) { @@ -1432,7 +1433,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) BUG_ON(rq->cpu != task_cpu(p)); BUG_ON(task_current(rq, p)); - BUG_ON(p->nr_cpus_allowed <= 1); + BUG_ON(tsk_nr_cpus_allowed(p) <= 1); BUG_ON(!task_on_rq_queued(p)); BUG_ON(!dl_task(p)); @@ -1471,7 +1472,7 @@ retry: */ if (dl_task(rq->curr) && dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && - rq->curr->nr_cpus_allowed > 1) { + tsk_nr_cpus_allowed(rq->curr) > 1) { resched_curr(rq); return 0; } @@ -1618,9 +1619,9 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && - p->nr_cpus_allowed > 1 && + tsk_nr_cpus_allowed(p) > 1 && dl_task(rq->curr) && - (rq->curr->nr_cpus_allowed < 2 || + (tsk_nr_cpus_allowed(rq->curr) < 2 || !dl_entity_preempt(&p->dl, &rq->curr->dl))) { push_dl_tasks(rq); } @@ -1724,7 +1725,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) if 
(task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP - if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) + if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded) queue_push_tasks(rq); #else if (dl_task(rq->curr)) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4fbc3bd5ff60..cf905f655ba1 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -626,15 +626,16 @@ do { \ #undef P #undef PN -#ifdef CONFIG_SCHEDSTATS -#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); -#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); - #ifdef CONFIG_SMP +#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); P64(avg_idle); P64(max_idle_balance_cost); +#undef P64 #endif +#ifdef CONFIG_SCHEDSTATS +#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); + if (schedstat_enabled()) { P(yld_count); P(sched_count); @@ -644,7 +645,6 @@ do { \ } #undef P -#undef P64 #endif spin_lock_irqsave(&sched_debug_lock, flags); print_cfs_stats(m, cpu); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 40748dc8ea3e..218f8e83db73 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -204,7 +204,7 @@ static void __update_inv_weight(struct load_weight *lw) * OR * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT * - * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case + * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case * we're guaranteed shift stays positive because inv_weight is guaranteed to * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22. * @@ -682,17 +682,68 @@ void init_entity_runnable_average(struct sched_entity *se) sa->period_contrib = 1023; sa->load_avg = scale_load_down(se->load.weight); sa->load_sum = sa->load_avg * LOAD_AVG_MAX; - sa->util_avg = scale_load_down(SCHED_LOAD_SCALE); - sa->util_sum = sa->util_avg * LOAD_AVG_MAX; + /* + * At this point, util_avg won't be used in select_task_rq_fair anyway + */ + sa->util_avg = 0; + sa->util_sum = 0; /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } +/* + * With new tasks being created, their initial util_avgs are extrapolated + * based on the cfs_rq's current util_avg: + * + * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight + * + * However, in many cases, the above util_avg does not give a desired + * value. Moreover, the sum of the util_avgs may be divergent, such + * as when the series is a harmonic series. + * + * To solve this problem, we also cap the util_avg of successive tasks to + * only 1/2 of the left utilization budget: + * + * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n + * + * where n denotes the nth task. + * + * For example, a simplest series from the beginning would be like: + * + * task util_avg: 512, 256, 128, 64, 32, 16, 8, ... + * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ... + * + * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap) + * if util_avg > util_avg_cap. 
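A worked example of the capping rule described in the comment above (sketch only): starting from an idle cfs_rq, each newly created task can receive at most half of the remaining utilization budget out of SCHED_CAPACITY_SCALE (1024), which reproduces the 512, 256, 128, ... series and keeps the cfs_rq sum bounded by 1024.

/*
 * Sketch reproducing the series from the comment: each new task gets
 * at most half of the utilization budget still unclaimed on the cfs_rq.
 */
static void example_util_caps(void)
{
        unsigned long cfs_util = 0, cap;
        int n;

        for (n = 0; n < 7; n++) {
                cap = (1024 - cfs_util) / 2;    /* remaining budget, halved */
                cfs_util += cap;                /* 512, 768, 896, 960, ... */
        }
        /* cfs_util ends at 1016, matching the cfs_rq line of the example */
}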
+ */ +void post_init_entity_util_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + struct sched_avg *sa = &se->avg; + long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; + + if (cap > 0) { + if (cfs_rq->avg.util_avg != 0) { + sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; + sa->util_avg /= (cfs_rq->avg.load_avg + 1); + + if (sa->util_avg > cap) + sa->util_avg = cap; + } else { + sa->util_avg = cap; + } + sa->util_sum = sa->util_avg * LOAD_AVG_MAX; + } +} + static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq); #else void init_entity_runnable_average(struct sched_entity *se) { } +void post_init_entity_util_avg(struct sched_entity *se) +{ +} #endif /* @@ -2437,10 +2488,12 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_sub(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); +#ifdef CONFIG_SMP if (entity_is_task(se)) { account_numa_dequeue(rq_of(cfs_rq), task_of(se)); list_del_init(&se->group_node); } +#endif cfs_rq->nr_running--; } @@ -2550,6 +2603,16 @@ static const u32 runnable_avg_yN_sum[] = { }; /* + * Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to + * lower integers. See Documentation/scheduler/sched-avg.txt how these + * were generated: + */ +static const u32 __accumulated_sum_N32[] = { + 0, 23371, 35056, 40899, 43820, 45281, + 46011, 46376, 46559, 46650, 46696, 46719, +}; + +/* * Approximate: * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) */ @@ -2597,22 +2660,13 @@ static u32 __compute_runnable_contrib(u64 n) else if (unlikely(n >= LOAD_AVG_MAX_N)) return LOAD_AVG_MAX; - /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */ - do { - contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */ - contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD]; - - n -= LOAD_AVG_PERIOD; - } while (n > LOAD_AVG_PERIOD); - + /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */ + contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD]; + n %= LOAD_AVG_PERIOD; contrib = decay_load(contrib, n); return contrib + runnable_avg_yN_sum[n]; } -#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10 -#error "load tracking assumes 2^10 as unit" -#endif - #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) /* @@ -2821,23 +2875,54 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); +static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + int cpu = cpu_of(rq); + + if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) { + unsigned long max = rq->cpu_capacity_orig; + + /* + * There are a few boundary cases this might miss but it should + * get called often enough that that should (hopefully) not be + * a real problem -- added to that it only calls on the local + * CPU, so if we enqueue remotely we'll miss an update, but + * the next tick/schedule should update. + * + * It will not get called when we go idle, because the idle + * thread is a different class (!fair), nor will the utilization + * number include things like RT tasks. + * + * As is, the util number is not freq-invariant (we'd have to + * implement arch_scale_freq_capacity() for that). + * + * See cpu_util(). 
+ */ + cpufreq_update_util(rq_clock(rq), + min(cfs_rq->avg.util_avg, max), max); + } +} + /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ -static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) +static inline int +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) { struct sched_avg *sa = &cfs_rq->avg; - int decayed, removed = 0; + int decayed, removed_load = 0, removed_util = 0; if (atomic_long_read(&cfs_rq->removed_load_avg)) { s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); sa->load_avg = max_t(long, sa->load_avg - r, 0); sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); - removed = 1; + removed_load = 1; } if (atomic_long_read(&cfs_rq->removed_util_avg)) { long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); sa->util_avg = max_t(long, sa->util_avg - r, 0); sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0); + removed_util = 1; } decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, @@ -2848,7 +2933,10 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) cfs_rq->load_last_update_time_copy = sa->last_update_time; #endif - return decayed || removed; + if (update_freq && (decayed || removed_util)) + cfs_rq_util_change(cfs_rq); + + return decayed || removed_load; } /* Update task and its cfs_rq load average */ @@ -2867,31 +2955,8 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); - if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) + if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg) update_tg_load_avg(cfs_rq, 0); - - if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) { - unsigned long max = rq->cpu_capacity_orig; - - /* - * There are a few boundary cases this might miss but it should - * get called often enough that that should (hopefully) not be - * a real problem -- added to that it only calls on the local - * CPU, so if we enqueue remotely we'll miss an update, but - * the next tick/schedule should update. - * - * It will not get called when we go idle, because the idle - * thread is a different class (!fair), nor will the utilization - * number include things like RT tasks. - * - * As is, the util number is not freq-invariant (we'd have to - * implement arch_scale_freq_capacity() for that). - * - * See cpu_util(). 
- */ - cpufreq_update_util(rq_clock(rq), - min(cfs_rq->avg.util_avg, max), max); - } } static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -2919,6 +2984,8 @@ skip_aging: cfs_rq->avg.load_sum += se->avg.load_sum; cfs_rq->avg.util_avg += se->avg.util_avg; cfs_rq->avg.util_sum += se->avg.util_sum; + + cfs_rq_util_change(cfs_rq); } static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -2931,6 +2998,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0); cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0); cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0); + + cfs_rq_util_change(cfs_rq); } /* Add the load generated by se into cfs_rq's load average */ @@ -2948,7 +3017,7 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->curr == se, NULL); } - decayed = update_cfs_rq_load_avg(now, cfs_rq); + decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated); cfs_rq->runnable_load_avg += sa->load_avg; cfs_rq->runnable_load_sum += sa->load_sum; @@ -3185,10 +3254,41 @@ static inline void check_schedstat_required(void) #endif } + +/* + * MIGRATION + * + * dequeue + * update_curr() + * update_min_vruntime() + * vruntime -= min_vruntime + * + * enqueue + * update_curr() + * update_min_vruntime() + * vruntime += min_vruntime + * + * this way the vruntime transition between RQs is done when both + * min_vruntime are up-to-date. + * + * WAKEUP (remote) + * + * ->migrate_task_rq_fair() (p->state == TASK_WAKING) + * vruntime -= min_vruntime + * + * enqueue + * update_curr() + * update_min_vruntime() + * vruntime += min_vruntime + * + * this way we don't have the most up-to-date min_vruntime on the originating + * CPU and an up-to-date min_vruntime on the destination CPU. + */ + static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING); + bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED); bool curr = cfs_rq->curr == se; /* @@ -3202,7 +3302,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* * Otherwise, renormalise after, such that we're placed at the current - * moment in time, instead of some random moment in the past. + * moment in time, instead of some random moment in the past. Being + * placed in the past could significantly boost this task to the + * fairness detriment of existing tasks. */ if (renorm && !curr) se->vruntime += cfs_rq->min_vruntime; @@ -4430,7 +4532,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } #ifdef CONFIG_SMP - +#ifdef CONFIG_NO_HZ_COMMON /* * per rq 'load' arrray crap; XXX kill this. */ @@ -4496,13 +4598,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) } return load; } +#endif /* CONFIG_NO_HZ_COMMON */ /** - * __update_cpu_load - update the rq->cpu_load[] statistics + * __cpu_load_update - update the rq->cpu_load[] statistics * @this_rq: The rq to update statistics for * @this_load: The current load * @pending_updates: The number of missed updates - * @active: !0 for NOHZ_FULL * * Update rq->cpu_load[] statistics. This function is usually called every * scheduler tick (TICK_NSEC). 
@@ -4531,12 +4633,12 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) * load[i]_n = (1 - 1/2^i)^n * load[i]_0 * * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra - * term. See the @active paramter. + * term. */ -static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, - unsigned long pending_updates, int active) +static void cpu_load_update(struct rq *this_rq, unsigned long this_load, + unsigned long pending_updates) { - unsigned long tickless_load = active ? this_rq->cpu_load[0] : 0; + unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0]; int i, scale; this_rq->nr_load_updates++; @@ -4549,6 +4651,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, /* scale is effectively 1 << i now, and >> i divides by scale */ old_load = this_rq->cpu_load[i]; +#ifdef CONFIG_NO_HZ_COMMON old_load = decay_load_missed(old_load, pending_updates - 1, i); if (tickless_load) { old_load -= decay_load_missed(tickless_load, pending_updates - 1, i); @@ -4559,6 +4662,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, */ old_load += tickless_load; } +#endif new_load = this_load; /* * Round up the averaging division if load is increasing. This @@ -4581,10 +4685,23 @@ static unsigned long weighted_cpuload(const int cpu) } #ifdef CONFIG_NO_HZ_COMMON -static void __update_cpu_load_nohz(struct rq *this_rq, - unsigned long curr_jiffies, - unsigned long load, - int active) +/* + * There is no sane way to deal with nohz on smp when using jiffies because the + * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading + * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. + * + * Therefore we need to avoid the delta approach from the regular tick when + * possible since that would seriously skew the load calculation. This is why we + * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on + * jiffies deltas for updates happening while in nohz mode (idle ticks, idle + * loop exit, nohz_idle_balance, nohz full exit...) + * + * This means we might still be one tick off for nohz periods. + */ + +static void cpu_load_update_nohz(struct rq *this_rq, + unsigned long curr_jiffies, + unsigned long load) { unsigned long pending_updates; @@ -4596,28 +4713,15 @@ static void __update_cpu_load_nohz(struct rq *this_rq, * In the NOHZ_FULL case, we were non-idle, we should consider * its weighted load. */ - __update_cpu_load(this_rq, load, pending_updates, active); + cpu_load_update(this_rq, load, pending_updates); } } /* - * There is no sane way to deal with nohz on smp when using jiffies because the - * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading - * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. - * - * Therefore we cannot use the delta approach from the regular tick since that - * would seriously skew the load calculation. However we'll make do for those - * updates happening while idle (nohz_idle_balance) or coming out of idle - * (tick_nohz_idle_exit). - * - * This means we might still be one tick off for nohz periods. - */ - -/* * Called from nohz_idle_balance() to update the load ratings before doing the * idle balance. */ -static void update_cpu_load_idle(struct rq *this_rq) +static void cpu_load_update_idle(struct rq *this_rq) { /* * bail if there's load or we're actually up-to-date. 
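The missed-update handling above follows the closed form load[i]_n = (1 - 1/2^i)^n * load[i]_0 quoted in the comment. A sketch of that decay in isolation (the real decay_load_missed() uses a precomputed factor table rather than looping): for i = 1, three missed ticks leave one eighth of the original contribution.

/*
 * Sketch of load[i]_n = (1 - 1/2^i)^n * load[i]_0, applied n times.
 */
static unsigned long example_decay_missed(unsigned long load, int i, int n)
{
        while (n--)
                load -= load >> i;      /* load *= (1 - 1/2^i) */
        return load;
}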
@@ -4625,38 +4729,71 @@ static void update_cpu_load_idle(struct rq *this_rq) if (weighted_cpuload(cpu_of(this_rq))) return; - __update_cpu_load_nohz(this_rq, READ_ONCE(jiffies), 0, 0); + cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0); } /* - * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. + * Record CPU load on nohz entry so we know the tickless load to account + * on nohz exit. cpu_load[0] happens then to be updated more frequently + * than other cpu_load[idx] but it should be fine as cpu_load readers + * shouldn't rely into synchronized cpu_load[*] updates. */ -void update_cpu_load_nohz(int active) +void cpu_load_update_nohz_start(void) { struct rq *this_rq = this_rq(); + + /* + * This is all lockless but should be fine. If weighted_cpuload changes + * concurrently we'll exit nohz. And cpu_load write can race with + * cpu_load_update_idle() but both updater would be writing the same. + */ + this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq)); +} + +/* + * Account the tickless load in the end of a nohz frame. + */ +void cpu_load_update_nohz_stop(void) +{ unsigned long curr_jiffies = READ_ONCE(jiffies); - unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0; + struct rq *this_rq = this_rq(); + unsigned long load; if (curr_jiffies == this_rq->last_load_update_tick) return; + load = weighted_cpuload(cpu_of(this_rq)); raw_spin_lock(&this_rq->lock); - __update_cpu_load_nohz(this_rq, curr_jiffies, load, active); + update_rq_clock(this_rq); + cpu_load_update_nohz(this_rq, curr_jiffies, load); raw_spin_unlock(&this_rq->lock); } -#endif /* CONFIG_NO_HZ */ +#else /* !CONFIG_NO_HZ_COMMON */ +static inline void cpu_load_update_nohz(struct rq *this_rq, + unsigned long curr_jiffies, + unsigned long load) { } +#endif /* CONFIG_NO_HZ_COMMON */ + +static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load) +{ +#ifdef CONFIG_NO_HZ_COMMON + /* See the mess around cpu_load_update_nohz(). */ + this_rq->last_load_update_tick = READ_ONCE(jiffies); +#endif + cpu_load_update(this_rq, load, 1); +} /* * Called from scheduler_tick() */ -void update_cpu_load_active(struct rq *this_rq) +void cpu_load_update_active(struct rq *this_rq) { unsigned long load = weighted_cpuload(cpu_of(this_rq)); - /* - * See the mess around update_cpu_load_idle() / update_cpu_load_nohz(). - */ - this_rq->last_load_update_tick = jiffies; - __update_cpu_load(this_rq, load, 1, 1); + + if (tick_nohz_tick_stopped()) + cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load); + else + cpu_load_update_periodic(this_rq, load); } /* @@ -4714,46 +4851,6 @@ static unsigned long cpu_avg_load_per_task(int cpu) return 0; } -static void record_wakee(struct task_struct *p) -{ - /* - * Rough decay (wiping) for cost saving, don't worry - * about the boundary, really active task won't care - * about the loss. 
- */ - if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) { - current->wakee_flips >>= 1; - current->wakee_flip_decay_ts = jiffies; - } - - if (current->last_wakee != p) { - current->last_wakee = p; - current->wakee_flips++; - } -} - -static void task_waking_fair(struct task_struct *p) -{ - struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 min_vruntime; - -#ifndef CONFIG_64BIT - u64 min_vruntime_copy; - - do { - min_vruntime_copy = cfs_rq->min_vruntime_copy; - smp_rmb(); - min_vruntime = cfs_rq->min_vruntime; - } while (min_vruntime != min_vruntime_copy); -#else - min_vruntime = cfs_rq->min_vruntime; -#endif - - se->vruntime -= min_vruntime; - record_wakee(p); -} - #ifdef CONFIG_FAIR_GROUP_SCHED /* * effective_load() calculates the load change as seen from the root_task_group @@ -4869,17 +4966,39 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) #endif +static void record_wakee(struct task_struct *p) +{ + /* + * Only decay a single time; tasks that have less then 1 wakeup per + * jiffy will not have built up many flips. + */ + if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) { + current->wakee_flips >>= 1; + current->wakee_flip_decay_ts = jiffies; + } + + if (current->last_wakee != p) { + current->last_wakee = p; + current->wakee_flips++; + } +} + /* * Detect M:N waker/wakee relationships via a switching-frequency heuristic. + * * A waker of many should wake a different task than the one last awakened - * at a frequency roughly N times higher than one of its wakees. In order - * to determine whether we should let the load spread vs consolodating to - * shared cache, we look for a minimum 'flip' frequency of llc_size in one - * partner, and a factor of lls_size higher frequency in the other. With - * both conditions met, we can be relatively sure that the relationship is - * non-monogamous, with partner count exceeding socket size. Waker/wakee - * being client/server, worker/dispatcher, interrupt source or whatever is - * irrelevant, spread criteria is apparent partner count exceeds socket size. + * at a frequency roughly N times higher than one of its wakees. + * + * In order to determine whether we should let the load spread vs consolidating + * to shared cache, we look for a minimum 'flip' frequency of llc_size in one + * partner, and a factor of lls_size higher frequency in the other. + * + * With both conditions met, we can be relatively sure that the relationship is + * non-monogamous, with partner count exceeding socket size. + * + * Waker/wakee being client/server, worker/dispatcher, interrupt source or + * whatever is irrelevant, spread criteria is apparent partner count exceeds + * socket size. */ static int wake_wide(struct task_struct *p) { @@ -5184,8 +5303,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int want_affine = 0; int sync = wake_flags & WF_SYNC; - if (sd_flag & SD_BALANCE_WAKE) + if (sd_flag & SD_BALANCE_WAKE) { + record_wakee(p); want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + } rcu_read_lock(); for_each_domain(cpu, tmp) { @@ -5265,6 +5386,32 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f static void migrate_task_rq_fair(struct task_struct *p) { /* + * As blocked tasks retain absolute vruntime the migration needs to + * deal with this by subtracting the old and adding the new + * min_vruntime -- the latter is done by enqueue_entity() when placing + * the task on the new runqueue. 
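The comment above states the invariant behind moving the vruntime adjustment from task_waking_fair() into migrate_task_rq_fair(): vruntime is made relative to the old rq's min_vruntime on the way out and absolute again against the new rq's min_vruntime on enqueue, so the entity keeps its lag regardless of how far the two queues' clocks have drifted. Both halves in sketch form (in the patch the second half is done by enqueue_entity() on the destination rq):

static void example_vruntime_migrate(struct sched_entity *se,
                                     struct cfs_rq *old, struct cfs_rq *new)
{
        se->vruntime -= old->min_vruntime;      /* leave: make it relative */
        se->vruntime += new->min_vruntime;      /* arrive: absolute again */
}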
+ */ + if (p->state == TASK_WAKING) { + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 min_vruntime; + +#ifndef CONFIG_64BIT + u64 min_vruntime_copy; + + do { + min_vruntime_copy = cfs_rq->min_vruntime_copy; + smp_rmb(); + min_vruntime = cfs_rq->min_vruntime; + } while (min_vruntime != min_vruntime_copy); +#else + min_vruntime = cfs_rq->min_vruntime; +#endif + + se->vruntime -= min_vruntime; + } + + /* * We are supposed to update the task to "current" time, then its up to date * and ready to go to new CPU/cfs_rq. But we have difficulty in getting * what current time is, so simply throw away the out-of-date time. This @@ -5447,7 +5594,7 @@ preempt: } static struct task_struct * -pick_next_task_fair(struct rq *rq, struct task_struct *prev) +pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) { struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; @@ -5560,9 +5707,9 @@ idle: * further scheduler activity on it and we're being very careful to * re-start the picking loop. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); new_tasks = idle_balance(rq); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, cookie); /* * Because idle_balance() releases (and re-acquires) rq->lock, it is * possible for any higher priority task to appear. In that case we @@ -5661,7 +5808,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * W_i,0 = \Sum_j w_i,j (2) * * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight - * is derived from the nice value as per prio_to_weight[]. + * is derived from the nice value as per sched_prio_to_weight[]. * * The weight average is an exponential decay average of the instantaneous * weight: @@ -6163,7 +6310,7 @@ static void update_blocked_averages(int cpu) if (throttled_hierarchy(cfs_rq)) continue; - if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq)) + if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) update_tg_load_avg(cfs_rq, 0); } raw_spin_unlock_irqrestore(&rq->lock, flags); @@ -6224,7 +6371,7 @@ static inline void update_blocked_averages(int cpu) raw_spin_lock_irqsave(&rq->lock, flags); update_rq_clock(rq); - update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); + update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -6633,6 +6780,9 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (!(env->sd->flags & SD_ASYM_PACKING)) return true; + /* No ASYM_PACKING if target cpu is already busy */ + if (env->idle == CPU_NOT_IDLE) + return true; /* * ASYM_PACKING needs to move all the work to the lowest * numbered CPUs in the group, therefore mark all groups @@ -6642,7 +6792,8 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (!sds->busiest) return true; - if (group_first_cpu(sds->busiest) > group_first_cpu(sg)) + /* Prefer to move from highest possible cpu's work */ + if (group_first_cpu(sds->busiest) < group_first_cpu(sg)) return true; } @@ -6788,6 +6939,9 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) if (!(env->sd->flags & SD_ASYM_PACKING)) return 0; + if (env->idle == CPU_NOT_IDLE) + return 0; + if (!sds->busiest) return 0; @@ -6896,9 +7050,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s } /* - * In the presence of smp nice balancing, certain scenarios can have - * max load less than avg load(as we skip the groups at or below - * its 
cpu_capacity, while calculating max_load..) + * Avg load of busiest sg can be less and avg load of local sg can + * be greater than avg load across all sgs of sd because avg load + * factors in sg capacity and sgs with smaller group_type are + * skipped when updating the busiest sg: */ if (busiest->avg_load <= sds->avg_load || local->avg_load >= sds->avg_load) { @@ -6911,11 +7066,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s */ if (busiest->group_type == group_overloaded && local->group_type == group_overloaded) { - load_above_capacity = busiest->sum_nr_running * - SCHED_LOAD_SCALE; - if (load_above_capacity > busiest->group_capacity) + load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE; + if (load_above_capacity > busiest->group_capacity) { load_above_capacity -= busiest->group_capacity; - else + load_above_capacity *= NICE_0_LOAD; + load_above_capacity /= busiest->group_capacity; + } else load_above_capacity = ~0UL; } @@ -6923,9 +7079,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * We're trying to get all the cpus to the average_load, so we don't * want to push ourselves above the average load, nor do we wish to * reduce the max loaded cpu below the average load. At the same time, - * we also don't want to reduce the group load below the group capacity - * (so that we can implement power-savings policies etc). Thus we look - * for the minimum possible imbalance. + * we also don't want to reduce the group load below the group + * capacity. Thus we look for the minimum possible imbalance. */ max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity); @@ -6949,10 +7104,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /** * find_busiest_group - Returns the busiest group within the sched_domain - * if there is an imbalance. If there isn't an imbalance, and - * the user has opted for power-savings, it returns a group whose - * CPUs can be put to idle by rebalancing those tasks elsewhere, if - * such a group exists. + * if there is an imbalance. * * Also calculates the amount of weighted load which should be moved * to restore balance. @@ -6960,9 +7112,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * @env: The load balancing environment. * * Return: - The busiest group if imbalance exists. - * - If no imbalance and user has opted for power-savings balance, - * return the least loaded group whose CPUs can be - * put to idle by rebalancing its tasks onto our group. */ static struct sched_group *find_busiest_group(struct lb_env *env) { @@ -6980,8 +7129,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) busiest = &sds.busiest_stat; /* ASYM feature bypasses nice load balance check */ - if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && - check_asym_packing(env, &sds)) + if (check_asym_packing(env, &sds)) return sds.busiest; /* There is no busy sibling group to pull tasks from */ @@ -7406,10 +7554,7 @@ more_balance: &busiest->active_balance_work); } - /* - * We've kicked active balancing, reset the failure - * counter. - */ + /* We've kicked active balancing, force task migration. */ sd->nr_balance_failed = sd->cache_nice_tries+1; } } else @@ -7644,10 +7789,13 @@ static int active_load_balance_cpu_stop(void *data) schedstat_inc(sd, alb_count); p = detach_one_task(&env); - if (p) + if (p) { schedstat_inc(sd, alb_pushed); - else + /* Active balancing done, reset the failure counter. 
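A worked example for the load_above_capacity rework above (sketch, assuming NICE_0_LOAD == SCHED_CAPACITY_SCALE == 1024): three running tasks on a two-CPU group of capacity 2048 are one task over capacity, i.e. 1024 in capacity units, which the new scaling converts to 512 in NICE_0_LOAD units before it is compared against the avg_load terms.

static unsigned long example_load_above_capacity(void)
{
        unsigned long sum_nr_running = 3, group_capacity = 2048;
        unsigned long x = sum_nr_running * 1024;        /* SCHED_CAPACITY_SCALE */

        x -= group_capacity;            /* 1024: excess in capacity units */
        x *= 1024;                      /* NICE_0_LOAD */
        x /= group_capacity;            /* 512: excess in load units */
        return x;
}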
*/ + sd->nr_balance_failed = 0; + } else { schedstat_inc(sd, alb_failed); + } } rcu_read_unlock(); out_unlock: @@ -7718,7 +7866,7 @@ static void nohz_balancer_kick(void) return; } -static inline void nohz_balance_exit_idle(int cpu) +void nohz_balance_exit_idle(unsigned int cpu) { if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { /* @@ -7791,18 +7939,6 @@ void nohz_balance_enter_idle(int cpu) atomic_inc(&nohz.nr_cpus); set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); } - -static int sched_ilb_notifier(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DYING: - nohz_balance_exit_idle(smp_processor_id()); - return NOTIFY_OK; - default: - return NOTIFY_DONE; - } -} #endif static DEFINE_SPINLOCK(balancing); @@ -7964,7 +8100,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) if (time_after_eq(jiffies, rq->next_balance)) { raw_spin_lock_irq(&rq->lock); update_rq_clock(rq); - update_cpu_load_idle(rq); + cpu_load_update_idle(rq); raw_spin_unlock_irq(&rq->lock); rebalance_domains(rq, CPU_IDLE); } @@ -8389,6 +8525,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); init_entity_runnable_average(se); + post_init_entity_util_avg(se); } return 1; @@ -8545,7 +8682,6 @@ const struct sched_class fair_sched_class = { .rq_online = rq_online_fair, .rq_offline = rq_offline_fair, - .task_waking = task_waking_fair, .task_dead = task_dead_fair, .set_cpus_allowed = set_cpus_allowed_common, #endif @@ -8607,7 +8743,6 @@ __init void init_sched_fair_class(void) #ifdef CONFIG_NO_HZ_COMMON nohz.next_balance = jiffies; zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); - cpu_notifier(sched_ilb_notifier, 0); #endif #endif /* SMP */ diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 47ce94931f1b..2ce5458bbe1d 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -24,7 +24,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl } static struct task_struct * -pick_next_task_idle(struct rq *rq, struct task_struct *prev) +pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) { put_prev_task(rq, prev); diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index ef7159012cf3..b0b93fd33af9 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -99,10 +99,13 @@ long calc_load_fold_active(struct rq *this_rq) static unsigned long calc_load(unsigned long load, unsigned long exp, unsigned long active) { - load *= exp; - load += active * (FIXED_1 - exp); - load += 1UL << (FSHIFT - 1); - return load >> FSHIFT; + unsigned long newload; + + newload = load * exp + active * (FIXED_1 - exp); + if (active >= load) + newload += FIXED_1-1; + + return newload / FIXED_1; } #ifdef CONFIG_NO_HZ_COMMON diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index ec4f538d4396..d5690b722691 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -334,7 +334,7 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) rt_rq = &rq_of_rt_rq(rt_rq)->rt; rt_rq->rt_nr_total++; - if (p->nr_cpus_allowed > 1) + if (tsk_nr_cpus_allowed(p) > 1) rt_rq->rt_nr_migratory++; update_rt_migration(rt_rq); @@ -351,7 +351,7 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) rt_rq = &rq_of_rt_rq(rt_rq)->rt; rt_rq->rt_nr_total--; - if (p->nr_cpus_allowed > 1) + if (tsk_nr_cpus_allowed(p) > 1) 
rt_rq->rt_nr_migratory--; update_rt_migration(rt_rq); @@ -953,14 +953,14 @@ static void update_curr_rt(struct rq *rq) if (curr->sched_class != &rt_sched_class) return; - /* Kick cpufreq (see the comment in linux/cpufreq.h). */ - if (cpu_of(rq) == smp_processor_id()) - cpufreq_trigger_update(rq_clock(rq)); - delta_exec = rq_clock_task(rq) - curr->se.exec_start; if (unlikely((s64)delta_exec <= 0)) return; + /* Kick cpufreq (see the comment in linux/cpufreq.h). */ + if (cpu_of(rq) == smp_processor_id()) + cpufreq_trigger_update(rq_clock(rq)); + schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); @@ -1324,7 +1324,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) enqueue_rt_entity(rt_se, flags); - if (!task_current(rq, p) && p->nr_cpus_allowed > 1) + if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1) enqueue_pushable_task(rq, p); } @@ -1413,7 +1413,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) * will have to sort it out. */ if (curr && unlikely(rt_task(curr)) && - (curr->nr_cpus_allowed < 2 || + (tsk_nr_cpus_allowed(curr) < 2 || curr->prio <= p->prio)) { int target = find_lowest_rq(p); @@ -1437,7 +1437,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) * Current can't be migrated, useless to reschedule, * let's hope p can move out. */ - if (rq->curr->nr_cpus_allowed == 1 || + if (tsk_nr_cpus_allowed(rq->curr) == 1 || !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) return; @@ -1445,7 +1445,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) * p is migratable, so let's not schedule it and * see if it is pushed or pulled somewhere else. */ - if (p->nr_cpus_allowed != 1 + if (tsk_nr_cpus_allowed(p) != 1 && cpupri_find(&rq->rd->cpupri, p, NULL)) return; @@ -1524,7 +1524,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) } static struct task_struct * -pick_next_task_rt(struct rq *rq, struct task_struct *prev) +pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) { struct task_struct *p; struct rt_rq *rt_rq = &rq->rt; @@ -1536,9 +1536,9 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) * disabled avoiding further scheduler activity on it and we're * being very careful to re-start the picking loop. 
*/ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); pull_rt_task(rq); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, cookie); /* * pull_rt_task() can drop (and re-acquire) rq->lock; this * means a dl or stop task can slip in, in which case we need @@ -1579,7 +1579,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) * The previous task needs to be made eligible for pushing * if it is still active */ - if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) + if (on_rt_rq(&p->rt) && tsk_nr_cpus_allowed(p) > 1) enqueue_pushable_task(rq, p); } @@ -1629,7 +1629,7 @@ static int find_lowest_rq(struct task_struct *task) if (unlikely(!lowest_mask)) return -1; - if (task->nr_cpus_allowed == 1) + if (tsk_nr_cpus_allowed(task) == 1) return -1; /* No other targets possible */ if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) @@ -1762,7 +1762,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) BUG_ON(rq->cpu != task_cpu(p)); BUG_ON(task_current(rq, p)); - BUG_ON(p->nr_cpus_allowed <= 1); + BUG_ON(tsk_nr_cpus_allowed(p) <= 1); BUG_ON(!task_on_rq_queued(p)); BUG_ON(!rt_task(p)); @@ -2122,9 +2122,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && - p->nr_cpus_allowed > 1 && + tsk_nr_cpus_allowed(p) > 1 && (dl_task(rq->curr) || rt_task(rq->curr)) && - (rq->curr->nr_cpus_allowed < 2 || + (tsk_nr_cpus_allowed(rq->curr) < 2 || rq->curr->prio <= p->prio)) push_rt_tasks(rq); } @@ -2197,7 +2197,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) */ if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP - if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) + if (tsk_nr_cpus_allowed(p) > 1 && rq->rt.overloaded) queue_push_tasks(rq); #else if (p->prio < rq->curr->prio) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ec2e8d23527e..e51145e76807 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -31,9 +31,9 @@ extern void calc_global_load_tick(struct rq *this_rq); extern long calc_load_fold_active(struct rq *this_rq); #ifdef CONFIG_SMP -extern void update_cpu_load_active(struct rq *this_rq); +extern void cpu_load_update_active(struct rq *this_rq); #else -static inline void update_cpu_load_active(struct rq *this_rq) { } +static inline void cpu_load_update_active(struct rq *this_rq) { } #endif /* @@ -49,25 +49,32 @@ static inline void update_cpu_load_active(struct rq *this_rq) { } * and does not change the user-interface for setting shares/weights. * * We increase resolution only if we have enough bits to allow this increased - * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution - * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the - * increased costs. + * resolution (i.e. 64bit). The costs for increasing resolution when 32bit are + * pretty high and the returns do not justify the increased costs. + * + * Really only required when CONFIG_FAIR_GROUP_SCHED is also set, but to + * increase coverage and consistency always enable it on 64bit platforms. 
*/ -#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */ -# define SCHED_LOAD_RESOLUTION 10 -# define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION) -# define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION) +#ifdef CONFIG_64BIT +# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) +# define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT) +# define scale_load_down(w) ((w) >> SCHED_FIXEDPOINT_SHIFT) #else -# define SCHED_LOAD_RESOLUTION 0 +# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT) # define scale_load(w) (w) # define scale_load_down(w) (w) #endif -#define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION) -#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) - -#define NICE_0_LOAD SCHED_LOAD_SCALE -#define NICE_0_SHIFT SCHED_LOAD_SHIFT +/* + * Task weight (visible to users) and its load (invisible to users) have + * independent resolution, but they should be well calibrated. We use + * scale_load() and scale_load_down(w) to convert between them. The + * following must be true: + * + * scale_load(sched_prio_to_weight[USER_PRIO(NICE_TO_PRIO(0))]) == NICE_0_LOAD + * + */ +#define NICE_0_LOAD (1L << NICE_0_LOAD_SHIFT) /* * Single value that decides SCHED_DEADLINE internal math precision. @@ -585,11 +592,13 @@ struct rq { #endif #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; - unsigned long last_load_update_tick; #ifdef CONFIG_NO_HZ_COMMON +#ifdef CONFIG_SMP + unsigned long last_load_update_tick; +#endif /* CONFIG_SMP */ u64 nohz_stamp; unsigned long nohz_flags; -#endif +#endif /* CONFIG_NO_HZ_COMMON */ #ifdef CONFIG_NO_HZ_FULL unsigned long last_sched_tick; #endif @@ -854,7 +863,7 @@ DECLARE_PER_CPU(struct sched_domain *, sd_asym); struct sched_group_capacity { atomic_t ref; /* - * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity + * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity * for a single CPU. */ unsigned int capacity; @@ -1159,7 +1168,7 @@ extern const u32 sched_prio_to_wmult[40]; * * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) - * ENQUEUE_WAKING - sched_class::task_waking was called + * ENQUEUE_MIGRATED - the task was migrated during wakeup * */ @@ -1174,9 +1183,9 @@ extern const u32 sched_prio_to_wmult[40]; #define ENQUEUE_HEAD 0x08 #define ENQUEUE_REPLENISH 0x10 #ifdef CONFIG_SMP -#define ENQUEUE_WAKING 0x20 +#define ENQUEUE_MIGRATED 0x20 #else -#define ENQUEUE_WAKING 0x00 +#define ENQUEUE_MIGRATED 0x00 #endif #define RETRY_TASK ((void *)-1UL) @@ -1200,14 +1209,14 @@ struct sched_class { * tasks. 
*/ struct task_struct * (*pick_next_task) (struct rq *rq, - struct task_struct *prev); + struct task_struct *prev, + struct pin_cookie cookie); void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); void (*migrate_task_rq)(struct task_struct *p); - void (*task_waking) (struct task_struct *task); void (*task_woken) (struct rq *this_rq, struct task_struct *task); void (*set_cpus_allowed)(struct task_struct *p, @@ -1313,6 +1322,7 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se); unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); +extern void post_init_entity_util_avg(struct sched_entity *se); #ifdef CONFIG_NO_HZ_FULL extern bool sched_can_stop_tick(struct rq *rq); @@ -1448,86 +1458,32 @@ static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } static inline void sched_avg_update(struct rq *rq) { } #endif -/* - * __task_rq_lock - lock the rq @p resides on. - */ -static inline struct rq *__task_rq_lock(struct task_struct *p) - __acquires(rq->lock) -{ - struct rq *rq; - - lockdep_assert_held(&p->pi_lock); - - for (;;) { - rq = task_rq(p); - raw_spin_lock(&rq->lock); - if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { - lockdep_pin_lock(&rq->lock); - return rq; - } - raw_spin_unlock(&rq->lock); - - while (unlikely(task_on_rq_migrating(p))) - cpu_relax(); - } -} +struct rq_flags { + unsigned long flags; + struct pin_cookie cookie; +}; -/* - * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. - */ -static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) +struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(rq->lock); +struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) __acquires(p->pi_lock) - __acquires(rq->lock) -{ - struct rq *rq; - - for (;;) { - raw_spin_lock_irqsave(&p->pi_lock, *flags); - rq = task_rq(p); - raw_spin_lock(&rq->lock); - /* - * move_queued_task() task_rq_lock() - * - * ACQUIRE (rq->lock) - * [S] ->on_rq = MIGRATING [L] rq = task_rq() - * WMB (__set_task_cpu()) ACQUIRE (rq->lock); - * [S] ->cpu = new_cpu [L] task_rq() - * [L] ->on_rq - * RELEASE (rq->lock) - * - * If we observe the old cpu in task_rq_lock, the acquire of - * the old rq->lock will fully serialize against the stores. - * - * If we observe the new cpu in task_rq_lock, the acquire will - * pair with the WMB to ensure we must then also see migrating. 
- */ - if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { - lockdep_pin_lock(&rq->lock); - return rq; - } - raw_spin_unlock(&rq->lock); - raw_spin_unlock_irqrestore(&p->pi_lock, *flags); - - while (unlikely(task_on_rq_migrating(p))) - cpu_relax(); - } -} + __acquires(rq->lock); -static inline void __task_rq_unlock(struct rq *rq) +static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) { - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, rf->cookie); raw_spin_unlock(&rq->lock); } static inline void -task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) +task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) __releases(rq->lock) __releases(p->pi_lock) { - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, rf->cookie); raw_spin_unlock(&rq->lock); - raw_spin_unlock_irqrestore(&p->pi_lock, *flags); + raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); } #ifdef CONFIG_SMP @@ -1743,6 +1699,10 @@ enum rq_nohz_flag_bits { }; #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) + +extern void nohz_balance_exit_idle(unsigned int cpu); +#else +static inline void nohz_balance_exit_idle(unsigned int cpu) { } #endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index cbc67da10954..604297a08b3a 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -24,7 +24,7 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) } static struct task_struct * -pick_next_task_stop(struct rq *rq, struct task_struct *prev) +pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) { struct task_struct *stop = rq->stop; diff --git a/kernel/signal.c b/kernel/signal.c index aa9bf00749c1..ab122a2cee41 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3099,12 +3099,14 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s oss.ss_sp = (void __user *) current->sas_ss_sp; oss.ss_size = current->sas_ss_size; - oss.ss_flags = sas_ss_flags(sp); + oss.ss_flags = sas_ss_flags(sp) | + (current->sas_ss_flags & SS_FLAG_BITS); if (uss) { void __user *ss_sp; size_t ss_size; - int ss_flags; + unsigned ss_flags; + int ss_mode; error = -EFAULT; if (!access_ok(VERIFY_READ, uss, sizeof(*uss))) @@ -3119,18 +3121,13 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s if (on_sig_stack(sp)) goto out; + ss_mode = ss_flags & ~SS_FLAG_BITS; error = -EINVAL; - /* - * Note - this code used to test ss_flags incorrectly: - * old code may have been written using ss_flags==0 - * to mean ss_flags==SS_ONSTACK (as this was the only - * way that worked) - this fix preserves that older - * mechanism. 
- */ - if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0) + if (ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK && + ss_mode != 0) goto out; - if (ss_flags == SS_DISABLE) { + if (ss_mode == SS_DISABLE) { ss_size = 0; ss_sp = NULL; } else { @@ -3141,6 +3138,7 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s current->sas_ss_sp = (unsigned long) ss_sp; current->sas_ss_size = ss_size; + current->sas_ss_flags = ss_flags; } error = 0; @@ -3171,9 +3169,14 @@ int restore_altstack(const stack_t __user *uss) int __save_altstack(stack_t __user *uss, unsigned long sp) { struct task_struct *t = current; - return __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) | - __put_user(sas_ss_flags(sp), &uss->ss_flags) | + int err = __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) | + __put_user(t->sas_ss_flags, &uss->ss_flags) | __put_user(t->sas_ss_size, &uss->ss_size); + if (err) + return err; + if (t->sas_ss_flags & SS_AUTODISARM) + sas_ss_reset(t); + return 0; } #ifdef CONFIG_COMPAT diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 725587f10667..c8b318663525 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -130,6 +130,9 @@ static int one_thousand = 1000; #ifdef CONFIG_PRINTK static int ten_thousand = 10000; #endif +#ifdef CONFIG_PERF_EVENTS +static int six_hundred_forty_kb = 640 * 1024; +#endif /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; @@ -1144,6 +1147,15 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one_hundred, }, + { + .procname = "perf_event_max_stack", + .data = NULL, /* filled in by handler */ + .maxlen = sizeof(sysctl_perf_event_max_stack), + .mode = 0644, + .proc_handler = perf_event_max_stack_handler, + .extra1 = &zero, + .extra2 = &six_hundred_forty_kb, + }, #endif #ifdef CONFIG_KMEMCHECK { diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 58e3310c9b21..536ada80f6dd 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -262,7 +262,7 @@ static void tick_nohz_dep_set_all(atomic_t *dep, { int prev; - prev = atomic_fetch_or(dep, BIT(bit)); + prev = atomic_fetch_or(BIT(bit), dep); if (!prev) tick_nohz_full_kick_all(); } @@ -292,7 +292,7 @@ void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) ts = per_cpu_ptr(&tick_cpu_sched, cpu); - prev = atomic_fetch_or(&ts->tick_dep_mask, BIT(bit)); + prev = atomic_fetch_or(BIT(bit), &ts->tick_dep_mask); if (!prev) { preempt_disable(); /* Perf needs local kick that is NMI safe */ @@ -776,6 +776,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, if (!ts->tick_stopped) { nohz_balance_enter_idle(cpu); calc_load_enter_idle(); + cpu_load_update_nohz_start(); ts->last_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; @@ -802,11 +803,11 @@ out: return tick; } -static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now, int active) +static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { /* Update jiffies first */ tick_do_update_jiffies64(now); - update_cpu_load_nohz(active); + cpu_load_update_nohz_stop(); calc_load_exit_idle(); touch_softlockup_watchdog_sched(); @@ -833,7 +834,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) if (can_stop_full_tick(ts)) tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); else if (ts->tick_stopped) - tick_nohz_restart_sched_tick(ts, ktime_get(), 1); + tick_nohz_restart_sched_tick(ts, ktime_get()); #endif } @@ -1024,7 +1025,7 @@ void 
tick_nohz_idle_exit(void) tick_nohz_stop_idle(ts, now); if (ts->tick_stopped) { - tick_nohz_restart_sched_tick(ts, now, 0); + tick_nohz_restart_sched_tick(ts, now); tick_nohz_account_idle_ticks(ts); } diff --git a/kernel/torture.c b/kernel/torture.c index 44aa462d033f..fa0bdeee17ac 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -451,6 +451,7 @@ static int torture_shutdown(void *arg) torture_shutdown_hook(); else VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping."); + ftrace_dump(DUMP_ALL); kernel_power_off(); /* Shut down the system. */ return 0; } @@ -602,8 +603,9 @@ bool torture_init_begin(char *ttype, bool v, int *runnable) { mutex_lock(&fullstop_mutex); if (torture_type != NULL) { - pr_alert("torture_init_begin: refusing %s init: %s running", + pr_alert("torture_init_begin: Refusing %s init: %s running.\n", ttype, torture_type); + pr_alert("torture_init_begin: One torture test at a time!\n"); mutex_unlock(&fullstop_mutex); return false; } diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 00df25fd86ef..e11108f1d197 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -47,6 +47,9 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event, if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) return -EPERM; + if (!is_sampling_event(p_event)) + return 0; + /* * We don't allow user space callchains for function trace * event, due to issues with page faults while tracing page diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3bfdff06eea7..5f5068e94003 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4554,6 +4554,17 @@ static void rebind_workers(struct worker_pool *pool) pool->attrs->cpumask) < 0); spin_lock_irq(&pool->lock); + + /* + * XXX: CPU hotplug notifiers are weird and can call DOWN_FAILED + * w/o preceding DOWN_PREPARE. Work around it. CPU hotplug is + * being reworked and this can go away in time. + */ + if (!(pool->flags & POOL_DISASSOCIATED)) { + spin_unlock_irq(&pool->lock); + return; + } + pool->flags &= ~POOL_DISASSOCIATED; for_each_pool_worker(worker, pool) { diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 1e9a607534ca..f4b797a690ba 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1289,6 +1289,39 @@ config TORTURE_TEST tristate default n +config RCU_PERF_TEST + tristate "performance tests for RCU" + depends on DEBUG_KERNEL + select TORTURE_TEST + select SRCU + select TASKS_RCU + default n + help + This option provides a kernel module that runs performance + tests on the RCU infrastructure. The kernel module may be built + after the fact on the running kernel to be tested, if desired. + + Say Y here if you want RCU performance tests to be built into + the kernel. + Say M if you want the RCU performance tests to build as a module. + Say N if you are unsure. + +config RCU_PERF_TEST_RUNNABLE + bool "performance tests for RCU runnable by default" + depends on RCU_PERF_TEST = y + default n + help + This option provides a way to build the RCU performance tests + directly into the kernel without them starting up at boot time. + You can use /sys/module to manually override this setting. + This /proc file is available only when the RCU performance + tests have been built into the kernel. + + Say Y here if you want the RCU performance tests to start during + boot (you probably don't). + Say N here if you want the RCU performance tests to start only + after being manually enabled via /sys/module. 
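A note on the kernel/signal.c hunks earlier in this patch: the reworked do_sigaltstack()/__save_altstack() pair introduces the SS_AUTODISARM flag. When it is set, the kernel saves and clears the alternate-stack settings on signal delivery and restores them on sigreturn, so code that leaves the handler via swapcontext() cannot have the same stack reused by a nested signal. A minimal user-space sketch follows; the fallback define and the choice of SIGUSR1 are illustrative assumptions, and on kernels without the feature the sigaltstack() call fails with EINVAL, which doubles as a detection method:

    #include <signal.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    #ifndef SS_AUTODISARM
    #define SS_AUTODISARM (1U << 31)    /* assumed fallback for older libc headers */
    #endif

    static void handler(int sig)
    {
            (void)sig;
            /* Running on the alternate stack; with SS_AUTODISARM the kernel has
             * already disarmed the alt-stack for this thread, so a context that
             * jumps away from here cannot be clobbered by a nested signal. */
            write(1, "on altstack\n", 12);
    }

    int main(void)
    {
            stack_t ss;
            struct sigaction sa;

            memset(&ss, 0, sizeof(ss));
            ss.ss_sp    = malloc(SIGSTKSZ);
            ss.ss_size  = SIGSTKSZ;
            ss.ss_flags = SS_AUTODISARM;        /* mode bits are 0, autodisarm requested */

            if (!ss.ss_sp || sigaltstack(&ss, NULL) < 0) {
                    perror("sigaltstack");      /* EINVAL here means the kernel lacks the flag */
                    return 1;
            }

            memset(&sa, 0, sizeof(sa));
            sa.sa_handler = handler;
            sa.sa_flags   = SA_ONSTACK;
            sigaction(SIGUSR1, &sa, NULL);

            raise(SIGUSR1);
            return 0;
    }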
+ config RCU_TORTURE_TEST tristate "torture tests for RCU" depends on DEBUG_KERNEL diff --git a/lib/Makefile b/lib/Makefile index 7bd6fd436c97..a65e9a861535 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -23,7 +23,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ rbtree.o radix-tree.o dump_stack.o timerqueue.o\ idr.o int_sqrt.o extable.o \ sha1.o md5.o irq_regs.o argv_split.o \ - proportions.o flex_proportions.o ratelimit.o show_mem.o \ + flex_proportions.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ earlycpio.o seq_buf.o nmi_backtrace.o diff --git a/lib/asn1_decoder.c b/lib/asn1_decoder.c index 2b3f46c049d4..554522934c44 100644 --- a/lib/asn1_decoder.c +++ b/lib/asn1_decoder.c @@ -74,7 +74,7 @@ next_tag: /* Extract a tag from the data */ tag = data[dp++]; - if (tag == 0) { + if (tag == ASN1_EOC) { /* It appears to be an EOC. */ if (data[dp++] != 0) goto invalid_eoc; @@ -96,10 +96,8 @@ next_tag: /* Extract the length */ len = data[dp++]; - if (len <= 0x7f) { - dp += len; - goto next_tag; - } + if (len <= 0x7f) + goto check_length; if (unlikely(len == ASN1_INDEFINITE_LENGTH)) { /* Indefinite length */ @@ -110,14 +108,18 @@ next_tag: } n = len - 0x80; - if (unlikely(n > sizeof(size_t) - 1)) + if (unlikely(n > sizeof(len) - 1)) goto length_too_long; if (unlikely(n > datalen - dp)) goto data_overrun_error; - for (len = 0; n > 0; n--) { + len = 0; + for (; n > 0; n--) { len <<= 8; len |= data[dp++]; } +check_length: + if (len > datalen - dp) + goto data_overrun_error; dp += len; goto next_tag; diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 5fecddc32b1b..ca5316e0087b 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -569,6 +569,25 @@ unsigned long iov_iter_alignment(const struct iov_iter *i) } EXPORT_SYMBOL(iov_iter_alignment); +unsigned long iov_iter_gap_alignment(const struct iov_iter *i) +{ + unsigned long res = 0; + size_t size = i->count; + if (!size) + return 0; + + iterate_all_kinds(i, size, v, + (res |= (!res ? 0 : (unsigned long)v.iov_base) | + (size != v.iov_len ? size : 0), 0), + (res |= (!res ? 0 : (unsigned long)v.bv_offset) | + (size != v.bv_len ? size : 0)), + (res |= (!res ? 0 : (unsigned long)v.iov_base) | + (size != v.iov_len ? size : 0)) + ); + return res; +} +EXPORT_SYMBOL(iov_iter_gap_alignment); + ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start) diff --git a/lib/proportions.c b/lib/proportions.c deleted file mode 100644 index efa54f259ea9..000000000000 --- a/lib/proportions.c +++ /dev/null @@ -1,407 +0,0 @@ -/* - * Floating proportions - * - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra - * - * Description: - * - * The floating proportion is a time derivative with an exponentially decaying - * history: - * - * p_{j} = \Sum_{i=0} (dx_{j}/dt_{-i}) / 2^(1+i) - * - * Where j is an element from {prop_local}, x_{j} is j's number of events, - * and i the time period over which the differential is taken. So d/dt_{-i} is - * the differential over the i-th last period. - * - * The decaying history gives smooth transitions. The time differential carries - * the notion of speed. - * - * The denominator is 2^(1+i) because we want the series to be normalised, ie. 
- * - * \Sum_{i=0} 1/2^(1+i) = 1 - * - * Further more, if we measure time (t) in the same events as x; so that: - * - * t = \Sum_{j} x_{j} - * - * we get that: - * - * \Sum_{j} p_{j} = 1 - * - * Writing this in an iterative fashion we get (dropping the 'd's): - * - * if (++x_{j}, ++t > period) - * t /= 2; - * for_each (j) - * x_{j} /= 2; - * - * so that: - * - * p_{j} = x_{j} / t; - * - * We optimize away the '/= 2' for the global time delta by noting that: - * - * if (++t > period) t /= 2: - * - * Can be approximated by: - * - * period/2 + (++t % period/2) - * - * [ Furthermore, when we choose period to be 2^n it can be written in terms of - * binary operations and wraparound artefacts disappear. ] - * - * Also note that this yields a natural counter of the elapsed periods: - * - * c = t / (period/2) - * - * [ Its monotonic increasing property can be applied to mitigate the wrap- - * around issue. ] - * - * This allows us to do away with the loop over all prop_locals on each period - * expiration. By remembering the period count under which it was last accessed - * as c_{j}, we can obtain the number of 'missed' cycles from: - * - * c - c_{j} - * - * We can then lazily catch up to the global period count every time we are - * going to use x_{j}, by doing: - * - * x_{j} /= 2^(c - c_{j}), c_{j} = c - */ - -#include <linux/proportions.h> -#include <linux/rcupdate.h> - -int prop_descriptor_init(struct prop_descriptor *pd, int shift, gfp_t gfp) -{ - int err; - - if (shift > PROP_MAX_SHIFT) - shift = PROP_MAX_SHIFT; - - pd->index = 0; - pd->pg[0].shift = shift; - mutex_init(&pd->mutex); - err = percpu_counter_init(&pd->pg[0].events, 0, gfp); - if (err) - goto out; - - err = percpu_counter_init(&pd->pg[1].events, 0, gfp); - if (err) - percpu_counter_destroy(&pd->pg[0].events); - -out: - return err; -} - -/* - * We have two copies, and flip between them to make it seem like an atomic - * update. The update is not really atomic wrt the events counter, but - * it is internally consistent with the bit layout depending on shift. - * - * We copy the events count, move the bits around and flip the index. - */ -void prop_change_shift(struct prop_descriptor *pd, int shift) -{ - int index; - int offset; - u64 events; - unsigned long flags; - - if (shift > PROP_MAX_SHIFT) - shift = PROP_MAX_SHIFT; - - mutex_lock(&pd->mutex); - - index = pd->index ^ 1; - offset = pd->pg[pd->index].shift - shift; - if (!offset) - goto out; - - pd->pg[index].shift = shift; - - local_irq_save(flags); - events = percpu_counter_sum(&pd->pg[pd->index].events); - if (offset < 0) - events <<= -offset; - else - events >>= offset; - percpu_counter_set(&pd->pg[index].events, events); - - /* - * ensure the new pg is fully written before the switch - */ - smp_wmb(); - pd->index = index; - local_irq_restore(flags); - - synchronize_rcu(); - -out: - mutex_unlock(&pd->mutex); -} - -/* - * wrap the access to the data in an rcu_read_lock() section; - * this is used to track the active references. 
- */ -static struct prop_global *prop_get_global(struct prop_descriptor *pd) -__acquires(RCU) -{ - int index; - - rcu_read_lock(); - index = pd->index; - /* - * match the wmb from vcd_flip() - */ - smp_rmb(); - return &pd->pg[index]; -} - -static void prop_put_global(struct prop_descriptor *pd, struct prop_global *pg) -__releases(RCU) -{ - rcu_read_unlock(); -} - -static void -prop_adjust_shift(int *pl_shift, unsigned long *pl_period, int new_shift) -{ - int offset = *pl_shift - new_shift; - - if (!offset) - return; - - if (offset < 0) - *pl_period <<= -offset; - else - *pl_period >>= offset; - - *pl_shift = new_shift; -} - -/* - * PERCPU - */ - -#define PROP_BATCH (8*(1+ilog2(nr_cpu_ids))) - -int prop_local_init_percpu(struct prop_local_percpu *pl, gfp_t gfp) -{ - raw_spin_lock_init(&pl->lock); - pl->shift = 0; - pl->period = 0; - return percpu_counter_init(&pl->events, 0, gfp); -} - -void prop_local_destroy_percpu(struct prop_local_percpu *pl) -{ - percpu_counter_destroy(&pl->events); -} - -/* - * Catch up with missed period expirations. - * - * until (c_{j} == c) - * x_{j} -= x_{j}/2; - * c_{j}++; - */ -static -void prop_norm_percpu(struct prop_global *pg, struct prop_local_percpu *pl) -{ - unsigned long period = 1UL << (pg->shift - 1); - unsigned long period_mask = ~(period - 1); - unsigned long global_period; - unsigned long flags; - - global_period = percpu_counter_read(&pg->events); - global_period &= period_mask; - - /* - * Fast path - check if the local and global period count still match - * outside of the lock. - */ - if (pl->period == global_period) - return; - - raw_spin_lock_irqsave(&pl->lock, flags); - prop_adjust_shift(&pl->shift, &pl->period, pg->shift); - - /* - * For each missed period, we half the local counter. - * basically: - * pl->events >> (global_period - pl->period); - */ - period = (global_period - pl->period) >> (pg->shift - 1); - if (period < BITS_PER_LONG) { - s64 val = percpu_counter_read(&pl->events); - - if (val < (nr_cpu_ids * PROP_BATCH)) - val = percpu_counter_sum(&pl->events); - - __percpu_counter_add(&pl->events, -val + (val >> period), - PROP_BATCH); - } else - percpu_counter_set(&pl->events, 0); - - pl->period = global_period; - raw_spin_unlock_irqrestore(&pl->lock, flags); -} - -/* - * ++x_{j}, ++t - */ -void __prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl) -{ - struct prop_global *pg = prop_get_global(pd); - - prop_norm_percpu(pg, pl); - __percpu_counter_add(&pl->events, 1, PROP_BATCH); - percpu_counter_add(&pg->events, 1); - prop_put_global(pd, pg); -} - -/* - * identical to __prop_inc_percpu, except that it limits this pl's fraction to - * @frac/PROP_FRAC_BASE by ignoring events when this limit has been exceeded. 
- */ -void __prop_inc_percpu_max(struct prop_descriptor *pd, - struct prop_local_percpu *pl, long frac) -{ - struct prop_global *pg = prop_get_global(pd); - - prop_norm_percpu(pg, pl); - - if (unlikely(frac != PROP_FRAC_BASE)) { - unsigned long period_2 = 1UL << (pg->shift - 1); - unsigned long counter_mask = period_2 - 1; - unsigned long global_count; - long numerator, denominator; - - numerator = percpu_counter_read_positive(&pl->events); - global_count = percpu_counter_read(&pg->events); - denominator = period_2 + (global_count & counter_mask); - - if (numerator > ((denominator * frac) >> PROP_FRAC_SHIFT)) - goto out_put; - } - - percpu_counter_add(&pl->events, 1); - percpu_counter_add(&pg->events, 1); - -out_put: - prop_put_global(pd, pg); -} - -/* - * Obtain a fraction of this proportion - * - * p_{j} = x_{j} / (period/2 + t % period/2) - */ -void prop_fraction_percpu(struct prop_descriptor *pd, - struct prop_local_percpu *pl, - long *numerator, long *denominator) -{ - struct prop_global *pg = prop_get_global(pd); - unsigned long period_2 = 1UL << (pg->shift - 1); - unsigned long counter_mask = period_2 - 1; - unsigned long global_count; - - prop_norm_percpu(pg, pl); - *numerator = percpu_counter_read_positive(&pl->events); - - global_count = percpu_counter_read(&pg->events); - *denominator = period_2 + (global_count & counter_mask); - - prop_put_global(pd, pg); -} - -/* - * SINGLE - */ - -int prop_local_init_single(struct prop_local_single *pl) -{ - raw_spin_lock_init(&pl->lock); - pl->shift = 0; - pl->period = 0; - pl->events = 0; - return 0; -} - -void prop_local_destroy_single(struct prop_local_single *pl) -{ -} - -/* - * Catch up with missed period expirations. - */ -static -void prop_norm_single(struct prop_global *pg, struct prop_local_single *pl) -{ - unsigned long period = 1UL << (pg->shift - 1); - unsigned long period_mask = ~(period - 1); - unsigned long global_period; - unsigned long flags; - - global_period = percpu_counter_read(&pg->events); - global_period &= period_mask; - - /* - * Fast path - check if the local and global period count still match - * outside of the lock. - */ - if (pl->period == global_period) - return; - - raw_spin_lock_irqsave(&pl->lock, flags); - prop_adjust_shift(&pl->shift, &pl->period, pg->shift); - /* - * For each missed period, we half the local counter. 
- */ - period = (global_period - pl->period) >> (pg->shift - 1); - if (likely(period < BITS_PER_LONG)) - pl->events >>= period; - else - pl->events = 0; - pl->period = global_period; - raw_spin_unlock_irqrestore(&pl->lock, flags); -} - -/* - * ++x_{j}, ++t - */ -void __prop_inc_single(struct prop_descriptor *pd, struct prop_local_single *pl) -{ - struct prop_global *pg = prop_get_global(pd); - - prop_norm_single(pg, pl); - pl->events++; - percpu_counter_add(&pg->events, 1); - prop_put_global(pd, pg); -} - -/* - * Obtain a fraction of this proportion - * - * p_{j} = x_{j} / (period/2 + t % period/2) - */ -void prop_fraction_single(struct prop_descriptor *pd, - struct prop_local_single *pl, - long *numerator, long *denominator) -{ - struct prop_global *pg = prop_get_global(pd); - unsigned long period_2 = 1UL << (pg->shift - 1); - unsigned long counter_mask = period_2 - 1; - unsigned long global_count; - - prop_norm_single(pg, pl); - *numerator = pl->events; - - global_count = percpu_counter_read(&pg->events); - *denominator = period_2 + (global_count & counter_mask); - - prop_put_global(pd, pg); -} diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f7daa7de8f48..b49ee126d4d1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1298,15 +1298,9 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page); /* * We can only reuse the page if nobody else maps the huge page or it's - * part. We can do it by checking page_mapcount() on each sub-page, but - * it's expensive. - * The cheaper way is to check page_count() to be equal 1: every - * mapcount takes page reference reference, so this way we can - * guarantee, that the PMD is the only mapping. - * This can give false negative if somebody pinned the page, but that's - * fine. + * part. */ - if (page_mapcount(page) == 1 && page_count(page) == 1) { + if (page_trans_huge_mapcount(page, NULL) == 1) { pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); @@ -2079,7 +2073,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, if (pte_write(pteval)) { writable = true; } else { - if (PageSwapCache(page) && !reuse_swap_page(page)) { + if (PageSwapCache(page) && + !reuse_swap_page(page, NULL)) { unlock_page(page); result = SCAN_SWAP_CACHE_PAGE; goto out; @@ -3223,6 +3218,64 @@ int total_mapcount(struct page *page) } /* + * This calculates accurately how many mappings a transparent hugepage + * has (unlike page_mapcount() which isn't fully accurate). This full + * accuracy is primarily needed to know if copy-on-write faults can + * reuse the page and change the mapping to read-write instead of + * copying them. At the same time this returns the total_mapcount too. + * + * The function returns the highest mapcount any one of the subpages + * has. If the return value is one, even if different processes are + * mapping different subpages of the transparent hugepage, they can + * all reuse it, because each process is reusing a different subpage. + * + * The total_mapcount is instead counting all virtual mappings of the + * subpages. If the total_mapcount is equal to "one", it tells the + * caller all mappings belong to the same "mm" and in turn the + * anon_vma of the transparent hugepage can become the vma->anon_vma + * local one as no other process may be mapping any of the subpages. 
+ * + * It would be more accurate to replace page_mapcount() with + * page_trans_huge_mapcount(), however we only use + * page_trans_huge_mapcount() in the copy-on-write faults where we + * need full accuracy to avoid breaking page pinning, because + * page_trans_huge_mapcount() is slower than page_mapcount(). + */ +int page_trans_huge_mapcount(struct page *page, int *total_mapcount) +{ + int i, ret, _total_mapcount, mapcount; + + /* hugetlbfs shouldn't call it */ + VM_BUG_ON_PAGE(PageHuge(page), page); + + if (likely(!PageTransCompound(page))) { + mapcount = atomic_read(&page->_mapcount) + 1; + if (total_mapcount) + *total_mapcount = mapcount; + return mapcount; + } + + page = compound_head(page); + + _total_mapcount = ret = 0; + for (i = 0; i < HPAGE_PMD_NR; i++) { + mapcount = atomic_read(&page[i]._mapcount) + 1; + ret = max(ret, mapcount); + _total_mapcount += mapcount; + } + if (PageDoubleMap(page)) { + ret -= 1; + _total_mapcount -= HPAGE_PMD_NR; + } + mapcount = compound_mapcount(page); + ret += mapcount; + _total_mapcount += mapcount; + if (total_mapcount) + *total_mapcount = _total_mapcount; + return ret; +} + +/* * This function splits huge page into normal pages. @page can point to any * subpage of huge page to split. Split doesn't change the position of @page. * @@ -783,6 +783,7 @@ static int unmerge_and_remove_all_rmap_items(void) } remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list); + up_read(&mm->mmap_sem); spin_lock(&ksm_mmlist_lock); ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, @@ -794,12 +795,9 @@ static int unmerge_and_remove_all_rmap_items(void) free_mm_slot(mm_slot); clear_bit(MMF_VM_MERGEABLE, &mm->flags); - up_read(&mm->mmap_sem); mmdrop(mm); - } else { + } else spin_unlock(&ksm_mmlist_lock); - up_read(&mm->mmap_sem); - } } /* Clean up stable nodes, but don't worry if some are still busy */ @@ -1663,8 +1661,15 @@ next_mm: up_read(&mm->mmap_sem); mmdrop(mm); } else { - spin_unlock(&ksm_mmlist_lock); up_read(&mm->mmap_sem); + /* + * up_read(&mm->mmap_sem) first because after + * spin_unlock(&ksm_mmlist_lock) run, the "mm" may + * already have been freed under us by __ksm_exit() + * because the "mm_slot" is still hashed and + * ksm_scan.mm_slot doesn't point to it anymore. + */ + spin_unlock(&ksm_mmlist_lock); } /* Repeat until we've completed scanning the whole list */ diff --git a/mm/memory.c b/mm/memory.c index 52c218e2b724..07493e34ab7e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2373,6 +2373,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * not dirty accountable. */ if (PageAnon(old_page) && !PageKsm(old_page)) { + int total_mapcount; if (!trylock_page(old_page)) { get_page(old_page); pte_unmap_unlock(page_table, ptl); @@ -2387,13 +2388,18 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, } put_page(old_page); } - if (reuse_swap_page(old_page)) { - /* - * The page is all ours. Move it to our anon_vma so - * the rmap code will not search our parent or siblings. - * Protected against the rmap code by the page lock. - */ - page_move_anon_rmap(old_page, vma, address); + if (reuse_swap_page(old_page, &total_mapcount)) { + if (total_mapcount == 1) { + /* + * The page is all ours. Move it to + * our anon_vma so the rmap code will + * not search our parent or siblings. + * Protected against the rmap code by + * the page lock. 
+ */ + page_move_anon_rmap(compound_head(old_page), + vma, address); + } unlock_page(old_page); return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte, old_page, 0, 0); @@ -2617,7 +2623,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, inc_mm_counter_fast(mm, MM_ANONPAGES); dec_mm_counter_fast(mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); - if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { + if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); flags &= ~FAULT_FLAG_WRITE; ret |= VM_FAULT_WRITE; diff --git a/mm/mmu_context.c b/mm/mmu_context.c index f802c2d216a7..6f4d27c5bb32 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c @@ -4,9 +4,9 @@ */ #include <linux/mm.h> +#include <linux/sched.h> #include <linux/mmu_context.h> #include <linux/export.h> -#include <linux/sched.h> #include <asm/mmu_context.h> diff --git a/mm/swapfile.c b/mm/swapfile.c index 83874eced5bf..031713ab40ce 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -922,18 +922,19 @@ out: * to it. And as a side-effect, free up its swap: because the old content * on disk will never be read, and seeking back there to write new content * later would only waste time away from clustering. + * + * NOTE: total_mapcount should not be relied upon by the caller if + * reuse_swap_page() returns false, but it may be always overwritten + * (see the other implementation for CONFIG_SWAP=n). */ -int reuse_swap_page(struct page *page) +bool reuse_swap_page(struct page *page, int *total_mapcount) { int count; VM_BUG_ON_PAGE(!PageLocked(page), page); if (unlikely(PageKsm(page))) - return 0; - /* The page is part of THP and cannot be reused */ - if (PageTransCompound(page)) - return 0; - count = page_mapcount(page); + return false; + count = page_trans_huge_mapcount(page, total_mapcount); if (count <= 1 && PageSwapCache(page)) { count += page_swapcount(page); if (count == 1 && !PageWriteback(page)) { diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index d97268e8ff10..2b68418c7198 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -975,6 +975,8 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) val = 65535 - 40; if (type == RTAX_MTU && val > 65535 - 15) val = 65535 - 15; + if (type == RTAX_HOPLIMIT && val > 255) + val = 255; if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) return -EINVAL; fi->fib_metrics[type - 1] = val; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d916d6ab9ad2..6f32944e0223 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1750,6 +1750,8 @@ static int ip6_convert_metrics(struct mx6_config *mxc, } else { val = nla_get_u32(nla); } + if (type == RTAX_HOPLIMIT && val > 255) + val = 255; if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) goto err; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 895d11dced3c..e27fd17c6743 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1778,6 +1778,7 @@ void nf_conntrack_init_end(void) int nf_conntrack_init_net(struct net *net) { + static atomic64_t unique_id; int ret = -ENOMEM; int cpu; @@ -1800,7 +1801,8 @@ int nf_conntrack_init_net(struct net *net) if (!net->ct.stat) goto err_pcpu_lists; - net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%p", net); + net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%llu", + (u64)atomic64_inc_return(&unique_id)); if (!net->ct.slabname) goto err_slabname; diff --git 
a/sound/pci/hda/hda_sysfs.c b/sound/pci/hda/hda_sysfs.c index 64e0d1d81ca5..9739fce9e032 100644 --- a/sound/pci/hda/hda_sysfs.c +++ b/sound/pci/hda/hda_sysfs.c @@ -141,14 +141,6 @@ static int reconfig_codec(struct hda_codec *codec) err = snd_hda_codec_configure(codec); if (err < 0) goto error; - /* rebuild PCMs */ - err = snd_hda_codec_build_pcms(codec); - if (err < 0) - goto error; - /* rebuild mixers */ - err = snd_hda_codec_build_controls(codec); - if (err < 0) - goto error; err = snd_card_register(codec->card); error: snd_hda_power_down(codec); diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c index 1483f85999ec..a010d704e0e2 100644 --- a/sound/pci/hda/patch_hdmi.c +++ b/sound/pci/hda/patch_hdmi.c @@ -3401,6 +3401,9 @@ static int patch_atihdmi(struct hda_codec *codec) spec->ops.pin_hbr_setup = atihdmi_pin_hbr_setup; spec->ops.setup_stream = atihdmi_setup_stream; + spec->chmap.ops.pin_get_slot_channel = atihdmi_pin_get_slot_channel; + spec->chmap.ops.pin_set_slot_channel = atihdmi_pin_set_slot_channel; + if (!has_amd_full_remap_support(codec)) { /* override to ATI/AMD-specific versions with pairwise mapping */ spec->chmap.ops.chmap_cea_alloc_validate_get_type = @@ -3408,10 +3411,6 @@ static int patch_atihdmi(struct hda_codec *codec) spec->chmap.ops.cea_alloc_to_tlv_chmap = atihdmi_paired_cea_alloc_to_tlv_chmap; spec->chmap.ops.chmap_validate = atihdmi_paired_chmap_validate; - spec->chmap.ops.pin_get_slot_channel = - atihdmi_pin_get_slot_channel; - spec->chmap.ops.pin_set_slot_channel = - atihdmi_pin_set_slot_channel; } /* ATI/AMD converters do not advertise all of their capabilities */ diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index ac4490a96863..4918ffa5ba68 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -6426,6 +6426,7 @@ enum { ALC668_FIXUP_DELL_DISABLE_AAMIX, ALC668_FIXUP_DELL_XPS13, ALC662_FIXUP_ASUS_Nx50, + ALC668_FIXUP_ASUS_Nx51, }; static const struct hda_fixup alc662_fixups[] = { @@ -6672,6 +6673,15 @@ static const struct hda_fixup alc662_fixups[] = { .chained = true, .chain_id = ALC662_FIXUP_BASS_1A }, + [ALC668_FIXUP_ASUS_Nx51] = { + .type = HDA_FIXUP_PINS, + .v.pins = (const struct hda_pintbl[]) { + {0x1a, 0x90170151}, /* bass speaker */ + {} + }, + .chained = true, + .chain_id = ALC662_FIXUP_BASS_CHMAP, + }, }; static const struct snd_pci_quirk alc662_fixup_tbl[] = { @@ -6694,11 +6704,14 @@ static const struct snd_pci_quirk alc662_fixup_tbl[] = { SND_PCI_QUIRK(0x1028, 0x0698, "Dell", ALC668_FIXUP_DELL_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1028, 0x069f, "Dell", ALC668_FIXUP_DELL_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x103c, 0x1632, "HP RP5800", ALC662_FIXUP_HP_RP5800), + SND_PCI_QUIRK(0x1043, 0x1080, "Asus UX501VW", ALC668_FIXUP_HEADSET_MODE), SND_PCI_QUIRK(0x1043, 0x11cd, "Asus N550", ALC662_FIXUP_ASUS_Nx50), SND_PCI_QUIRK(0x1043, 0x13df, "Asus N550JX", ALC662_FIXUP_BASS_1A), SND_PCI_QUIRK(0x1043, 0x129d, "Asus N750", ALC662_FIXUP_ASUS_Nx50), SND_PCI_QUIRK(0x1043, 0x1477, "ASUS N56VZ", ALC662_FIXUP_BASS_MODE4_CHMAP), SND_PCI_QUIRK(0x1043, 0x15a7, "ASUS UX51VZH", ALC662_FIXUP_BASS_16), + SND_PCI_QUIRK(0x1043, 0x177d, "ASUS N551", ALC668_FIXUP_ASUS_Nx51), + SND_PCI_QUIRK(0x1043, 0x17bd, "ASUS N751", ALC668_FIXUP_ASUS_Nx51), SND_PCI_QUIRK(0x1043, 0x1b73, "ASUS N55SF", ALC662_FIXUP_BASS_16), SND_PCI_QUIRK(0x1043, 0x1bf3, "ASUS N76VZ", ALC662_FIXUP_BASS_MODE4_CHMAP), SND_PCI_QUIRK(0x1043, 0x8469, "ASUS mobo", ALC662_FIXUP_NO_JACK_DETECT), diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 
0adfd9537cf7..6adde457b602 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -1137,8 +1137,11 @@ bool snd_usb_get_sample_rate_quirk(struct snd_usb_audio *chip) case USB_ID(0x047F, 0x0415): /* Plantronics BT-300 */ case USB_ID(0x047F, 0xAA05): /* Plantronics DA45 */ case USB_ID(0x04D8, 0xFEEA): /* Benchmark DAC1 Pre */ + case USB_ID(0x0556, 0x0014): /* Phoenix Audio TMX320VC */ case USB_ID(0x074D, 0x3553): /* Outlaw RR2150 (Micronas UAC3553B) */ + case USB_ID(0x1de7, 0x0013): /* Phoenix Audio MT202exe */ case USB_ID(0x1de7, 0x0014): /* Phoenix Audio TMX320 */ + case USB_ID(0x1de7, 0x0114): /* Phoenix Audio MT202pcs */ case USB_ID(0x21B4, 0x0081): /* AudioQuest DragonFly */ return true; } diff --git a/tools/Makefile b/tools/Makefile index 60c7e6c8ff17..6bf68fe7dd29 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -137,7 +137,8 @@ libsubcmd_clean: $(call descend,lib/subcmd,clean) perf_clean: - $(call descend,$(@:_clean=),clean) + $(Q)mkdir -p $(PERF_O) . + $(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir= clean selftests_clean: $(call descend,testing/$(@:_clean=),clean) diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature index 6b7707270aa3..57c8f98874e8 100644 --- a/tools/build/Makefile.feature +++ b/tools/build/Makefile.feature @@ -30,6 +30,7 @@ endef FEATURE_TESTS_BASIC := \ backtrace \ dwarf \ + dwarf_getlocations \ fortify-source \ sync-compare-and-swap \ glibc \ @@ -48,6 +49,10 @@ FEATURE_TESTS_BASIC := \ libslang \ libcrypto \ libunwind \ + libunwind-x86 \ + libunwind-x86_64 \ + libunwind-arm \ + libunwind-aarch64 \ pthread-attr-setaffinity-np \ stackprotector-all \ timerfd \ @@ -68,7 +73,9 @@ FEATURE_TESTS_EXTRA := \ libbabeltrace \ liberty \ liberty-z \ - libunwind-debug-frame + libunwind-debug-frame \ + libunwind-debug-frame-arm \ + libunwind-debug-frame-aarch64 FEATURE_TESTS ?= $(FEATURE_TESTS_BASIC) @@ -78,6 +85,7 @@ endif FEATURE_DISPLAY ?= \ dwarf \ + dwarf_getlocations \ glibc \ gtk2 \ libaudit \ diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile index c5f4c417428d..3d88f09e188b 100644 --- a/tools/build/feature/Makefile +++ b/tools/build/feature/Makefile @@ -3,6 +3,7 @@ FILES= \ test-backtrace.bin \ test-bionic.bin \ test-dwarf.bin \ + test-dwarf_getlocations.bin \ test-fortify-source.bin \ test-sync-compare-and-swap.bin \ test-glibc.bin \ @@ -26,6 +27,12 @@ FILES= \ test-libcrypto.bin \ test-libunwind.bin \ test-libunwind-debug-frame.bin \ + test-libunwind-x86.bin \ + test-libunwind-x86_64.bin \ + test-libunwind-arm.bin \ + test-libunwind-aarch64.bin \ + test-libunwind-debug-frame-arm.bin \ + test-libunwind-debug-frame-aarch64.bin \ test-pthread-attr-setaffinity-np.bin \ test-stackprotector-all.bin \ test-timerfd.bin \ @@ -82,6 +89,9 @@ endif $(OUTPUT)test-dwarf.bin: $(BUILD) $(DWARFLIBS) +$(OUTPUT)test-dwarf_getlocations.bin: + $(BUILD) $(DWARFLIBS) + $(OUTPUT)test-libelf-mmap.bin: $(BUILD) -lelf @@ -99,6 +109,23 @@ $(OUTPUT)test-libunwind.bin: $(OUTPUT)test-libunwind-debug-frame.bin: $(BUILD) -lelf +$(OUTPUT)test-libunwind-x86.bin: + $(BUILD) -lelf -lunwind-x86 + +$(OUTPUT)test-libunwind-x86_64.bin: + $(BUILD) -lelf -lunwind-x86_64 + +$(OUTPUT)test-libunwind-arm.bin: + $(BUILD) -lelf -lunwind-arm + +$(OUTPUT)test-libunwind-aarch64.bin: + $(BUILD) -lelf -lunwind-aarch64 + +$(OUTPUT)test-libunwind-debug-frame-arm.bin: + $(BUILD) -lelf -lunwind-arm + +$(OUTPUT)test-libunwind-debug-frame-aarch64.bin: + $(BUILD) -lelf -lunwind-aarch64 $(OUTPUT)test-libaudit.bin: $(BUILD) -laudit diff --git 
a/tools/build/feature/test-all.c b/tools/build/feature/test-all.c index e499a36c1e4a..a282e8cb84f3 100644 --- a/tools/build/feature/test-all.c +++ b/tools/build/feature/test-all.c @@ -41,6 +41,10 @@ # include "test-dwarf.c" #undef main +#define main main_test_dwarf_getlocations +# include "test-dwarf_getlocations.c" +#undef main + #define main main_test_libelf_getphdrnum # include "test-libelf-getphdrnum.c" #undef main @@ -143,6 +147,7 @@ int main(int argc, char *argv[]) main_test_libelf_mmap(); main_test_glibc(); main_test_dwarf(); + main_test_dwarf_getlocations(); main_test_libelf_getphdrnum(); main_test_libunwind(); main_test_libaudit(); diff --git a/tools/build/feature/test-bpf.c b/tools/build/feature/test-bpf.c index b389026839b9..e04ab89a1013 100644 --- a/tools/build/feature/test-bpf.c +++ b/tools/build/feature/test-bpf.c @@ -27,10 +27,9 @@ int main(void) attr.log_level = 0; attr.kern_version = 0; - attr = attr; /* * Test existence of __NR_bpf and BPF_PROG_LOAD. * This call should fail if we run the testcase. */ - return syscall(__NR_bpf, BPF_PROG_LOAD, attr, sizeof(attr)); + return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)); } diff --git a/tools/build/feature/test-dwarf_getlocations.c b/tools/build/feature/test-dwarf_getlocations.c new file mode 100644 index 000000000000..70162699dd43 --- /dev/null +++ b/tools/build/feature/test-dwarf_getlocations.c @@ -0,0 +1,12 @@ +#include <stdlib.h> +#include <elfutils/libdw.h> + +int main(void) +{ + Dwarf_Addr base, start, end; + Dwarf_Attribute attr; + Dwarf_Op *op; + size_t nops; + ptrdiff_t offset = 0; + return (int)dwarf_getlocations(&attr, offset, &base, &start, &end, &op, &nops); +} diff --git a/tools/build/feature/test-libunwind-aarch64.c b/tools/build/feature/test-libunwind-aarch64.c new file mode 100644 index 000000000000..fc03fb64e8c1 --- /dev/null +++ b/tools/build/feature/test-libunwind-aarch64.c @@ -0,0 +1,26 @@ +#include <libunwind-aarch64.h> +#include <stdlib.h> + +extern int UNW_OBJ(dwarf_search_unwind_table) (unw_addr_space_t as, + unw_word_t ip, + unw_dyn_info_t *di, + unw_proc_info_t *pi, + int need_unwind_info, void *arg); + +#define dwarf_search_unwind_table UNW_OBJ(dwarf_search_unwind_table) + +static unw_accessors_t accessors; + +int main(void) +{ + unw_addr_space_t addr_space; + + addr_space = unw_create_addr_space(&accessors, 0); + if (addr_space) + return 0; + + unw_init_remote(NULL, addr_space, NULL); + dwarf_search_unwind_table(addr_space, 0, NULL, NULL, 0, NULL); + + return 0; +} diff --git a/tools/build/feature/test-libunwind-arm.c b/tools/build/feature/test-libunwind-arm.c new file mode 100644 index 000000000000..632d95ec641f --- /dev/null +++ b/tools/build/feature/test-libunwind-arm.c @@ -0,0 +1,27 @@ +#include <libunwind-arm.h> +#include <stdlib.h> + +extern int UNW_OBJ(dwarf_search_unwind_table) (unw_addr_space_t as, + unw_word_t ip, + unw_dyn_info_t *di, + unw_proc_info_t *pi, + int need_unwind_info, void *arg); + + +#define dwarf_search_unwind_table UNW_OBJ(dwarf_search_unwind_table) + +static unw_accessors_t accessors; + +int main(void) +{ + unw_addr_space_t addr_space; + + addr_space = unw_create_addr_space(&accessors, 0); + if (addr_space) + return 0; + + unw_init_remote(NULL, addr_space, NULL); + dwarf_search_unwind_table(addr_space, 0, NULL, NULL, 0, NULL); + + return 0; +} diff --git a/tools/build/feature/test-libunwind-debug-frame-aarch64.c b/tools/build/feature/test-libunwind-debug-frame-aarch64.c new file mode 100644 index 000000000000..22844673fc26 --- /dev/null +++ 
b/tools/build/feature/test-libunwind-debug-frame-aarch64.c @@ -0,0 +1,16 @@ +#include <libunwind-aarch64.h> +#include <stdlib.h> + +extern int +UNW_OBJ(dwarf_find_debug_frame) (int found, unw_dyn_info_t *di_debug, + unw_word_t ip, unw_word_t segbase, + const char *obj_name, unw_word_t start, + unw_word_t end); + +#define dwarf_find_debug_frame UNW_OBJ(dwarf_find_debug_frame) + +int main(void) +{ + dwarf_find_debug_frame(0, NULL, 0, 0, NULL, 0, 0); + return 0; +} diff --git a/tools/build/feature/test-libunwind-debug-frame-arm.c b/tools/build/feature/test-libunwind-debug-frame-arm.c new file mode 100644 index 000000000000..f98859684fee --- /dev/null +++ b/tools/build/feature/test-libunwind-debug-frame-arm.c @@ -0,0 +1,16 @@ +#include <libunwind-arm.h> +#include <stdlib.h> + +extern int +UNW_OBJ(dwarf_find_debug_frame) (int found, unw_dyn_info_t *di_debug, + unw_word_t ip, unw_word_t segbase, + const char *obj_name, unw_word_t start, + unw_word_t end); + +#define dwarf_find_debug_frame UNW_OBJ(dwarf_find_debug_frame) + +int main(void) +{ + dwarf_find_debug_frame(0, NULL, 0, 0, NULL, 0, 0); + return 0; +} diff --git a/tools/build/feature/test-libunwind-x86.c b/tools/build/feature/test-libunwind-x86.c new file mode 100644 index 000000000000..3561edce305e --- /dev/null +++ b/tools/build/feature/test-libunwind-x86.c @@ -0,0 +1,27 @@ +#include <libunwind-x86.h> +#include <stdlib.h> + +extern int UNW_OBJ(dwarf_search_unwind_table) (unw_addr_space_t as, + unw_word_t ip, + unw_dyn_info_t *di, + unw_proc_info_t *pi, + int need_unwind_info, void *arg); + + +#define dwarf_search_unwind_table UNW_OBJ(dwarf_search_unwind_table) + +static unw_accessors_t accessors; + +int main(void) +{ + unw_addr_space_t addr_space; + + addr_space = unw_create_addr_space(&accessors, 0); + if (addr_space) + return 0; + + unw_init_remote(NULL, addr_space, NULL); + dwarf_search_unwind_table(addr_space, 0, NULL, NULL, 0, NULL); + + return 0; +} diff --git a/tools/build/feature/test-libunwind-x86_64.c b/tools/build/feature/test-libunwind-x86_64.c new file mode 100644 index 000000000000..5add2517b2a1 --- /dev/null +++ b/tools/build/feature/test-libunwind-x86_64.c @@ -0,0 +1,27 @@ +#include <libunwind-x86_64.h> +#include <stdlib.h> + +extern int UNW_OBJ(dwarf_search_unwind_table) (unw_addr_space_t as, + unw_word_t ip, + unw_dyn_info_t *di, + unw_proc_info_t *pi, + int need_unwind_info, void *arg); + + +#define dwarf_search_unwind_table UNW_OBJ(dwarf_search_unwind_table) + +static unw_accessors_t accessors; + +int main(void) +{ + unw_addr_space_t addr_space; + + addr_space = unw_create_addr_space(&accessors, 0); + if (addr_space) + return 0; + + unw_init_remote(NULL, addr_space, NULL); + dwarf_search_unwind_table(addr_space, 0, NULL, NULL, 0, NULL); + + return 0; +} diff --git a/tools/lguest/lguest.c b/tools/lguest/lguest.c index 80159e6811c2..d9836c5eb694 100644 --- a/tools/lguest/lguest.c +++ b/tools/lguest/lguest.c @@ -3351,12 +3351,18 @@ int main(int argc, char *argv[]) /* Boot protocol version: 2.07 supports the fields for lguest. */ boot->hdr.version = 0x207; - /* The hardware_subarch value of "1" tells the Guest it's an lguest. */ - boot->hdr.hardware_subarch = 1; + /* X86_SUBARCH_LGUEST tells the Guest it's an lguest. */ + boot->hdr.hardware_subarch = X86_SUBARCH_LGUEST; /* Tell the entry path not to try to reload segment registers. 
*/ boot->hdr.loadflags |= KEEP_SEGMENTS; + /* We don't support tboot: */ + boot->tboot_addr = 0; + + /* Ensure this is 0 to prevent APM from loading: */ + boot->apm_bios_info.version = 0; + /* We tell the kernel to initialize the Guest. */ tell_kernel(start); diff --git a/tools/lib/api/fs/fs.c b/tools/lib/api/fs/fs.c index ef78c22ff44d..08556cf2c70d 100644 --- a/tools/lib/api/fs/fs.c +++ b/tools/lib/api/fs/fs.c @@ -351,6 +351,19 @@ int filename__read_str(const char *filename, char **buf, size_t *sizep) return err; } +int procfs__read_str(const char *entry, char **buf, size_t *sizep) +{ + char path[PATH_MAX]; + const char *procfs = procfs__mountpoint(); + + if (!procfs) + return -1; + + snprintf(path, sizeof(path), "%s/%s", procfs, entry); + + return filename__read_str(path, buf, sizep); +} + int sysfs__read_ull(const char *entry, unsigned long long *value) { char path[PATH_MAX]; diff --git a/tools/lib/api/fs/fs.h b/tools/lib/api/fs/fs.h index 9f6598098dc5..16c9c2ed7c5b 100644 --- a/tools/lib/api/fs/fs.h +++ b/tools/lib/api/fs/fs.h @@ -29,6 +29,8 @@ int filename__read_int(const char *filename, int *value); int filename__read_ull(const char *filename, unsigned long long *value); int filename__read_str(const char *filename, char **buf, size_t *sizep); +int procfs__read_str(const char *entry, char **buf, size_t *sizep); + int sysctl__read_int(const char *sysctl, int *value); int sysfs__read_int(const char *entry, int *value); int sysfs__read_ull(const char *entry, unsigned long long *value); diff --git a/tools/lib/traceevent/parse-filter.c b/tools/lib/traceevent/parse-filter.c index 0144b3d1bb77..88cccea3ca99 100644 --- a/tools/lib/traceevent/parse-filter.c +++ b/tools/lib/traceevent/parse-filter.c @@ -1164,11 +1164,11 @@ process_filter(struct event_format *event, struct filter_arg **parg, current_op = current_exp; ret = collapse_tree(current_op, parg, error_str); + /* collapse_tree() may free current_op, and updates parg accordingly */ + current_op = NULL; if (ret < 0) goto fail; - *parg = current_op; - free(token); return 0; diff --git a/tools/perf/Documentation/intel-pt.txt b/tools/perf/Documentation/intel-pt.txt index be764f9ec769..c6c8318e38a2 100644 --- a/tools/perf/Documentation/intel-pt.txt +++ b/tools/perf/Documentation/intel-pt.txt @@ -672,6 +672,7 @@ The letters are: d create a debug log g synthesize a call chain (use with i or x) l synthesize last branch entries (use with i or x) + s skip initial number of events "Instructions" events look like they were recorded by "perf record -e instructions". @@ -730,6 +731,12 @@ from one sample to the next. To disable trace decoding entirely, use the option --no-itrace. +It is also possible to skip events generated (instructions, branches, transactions) +at the beginning. This is useful to ignore initialization code. + + --itrace=i0nss1000000 + +skips the first million instructions. dump option ----------- diff --git a/tools/perf/Documentation/itrace.txt b/tools/perf/Documentation/itrace.txt index 65453f4c7006..e2a4c5e0dbe5 100644 --- a/tools/perf/Documentation/itrace.txt +++ b/tools/perf/Documentation/itrace.txt @@ -7,6 +7,7 @@ d create a debug log g synthesize a call chain (use with i or x) l synthesize last branch entries (use with i or x) + s skip initial number of events The default is all events i.e. the same as --itrace=ibxe @@ -24,3 +25,10 @@ Also the number of last branch entries (default 64, max. 1024) for instructions or transactions events can be specified. 
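The procfs__read_str() helper added to tools/lib/api/fs above follows the pattern of the existing sysfs helpers: resolve the procfs mountpoint, build the path, and hand off to filename__read_str(), which allocates the buffer. A minimal caller might look like the sketch below; the "cmdline" entry, the zero-on-success assumption and the caller-frees-the-buffer convention are inferred from filename__read_str() rather than stated in the patch:

    #include <stdio.h>
    #include <stdlib.h>
    #include <api/fs/fs.h>      /* include path as used inside tools/perf; adjust -I elsewhere */

    static int print_cmdline(void)
    {
            char *buf = NULL;
            size_t size = 0;

            /* Reads "<procfs mountpoint>/cmdline" into a buffer allocated by the helper. */
            if (procfs__read_str("cmdline", &buf, &size))
                    return -1;

            printf("%.*s", (int)size, buf);
            free(buf);          /* assumption: ownership passes to the caller */
            return 0;
    }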
+
+	It is also possible to skip events generated (instructions, branches, transactions)
+	at the beginning. This is useful to ignore initialization code.
+
+	--itrace=i0nss1000000
+
+	skips the first million instructions.
diff --git a/tools/perf/Documentation/perf-annotate.txt b/tools/perf/Documentation/perf-annotate.txt
index e9cd39a92dc2..778f54d4d0bd 100644
--- a/tools/perf/Documentation/perf-annotate.txt
+++ b/tools/perf/Documentation/perf-annotate.txt
@@ -33,7 +33,7 @@ OPTIONS
 -f::
 --force::
-	Don't complain, do it.
+	Don't do ownership validation.
 
 -v::
 --verbose::
diff --git a/tools/perf/Documentation/perf-diff.txt b/tools/perf/Documentation/perf-diff.txt
index d1deb573877f..3e9490b9c533 100644
--- a/tools/perf/Documentation/perf-diff.txt
+++ b/tools/perf/Documentation/perf-diff.txt
@@ -75,7 +75,7 @@ OPTIONS
 -f::
 --force::
-	Don't complain, do it.
+	Don't do ownership validation.
 
 --symfs=<directory>::
 	Look for files with symbols relative to this directory.
diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt
index ec723d0a5bb3..a126e97a8114 100644
--- a/tools/perf/Documentation/perf-list.txt
+++ b/tools/perf/Documentation/perf-list.txt
@@ -93,6 +93,67 @@ raw encoding of 0x1A8 can be used:
 You should refer to the processor specific documentation for getting these
 details. Some of them are referenced in the SEE ALSO section below.
 
+ARBITRARY PMUS
+--------------
+
+perf also supports an extended syntax for specifying raw parameters
+to PMUs. Using this typically requires looking up the specific event
+in the CPU vendor specific documentation.
+
+The available PMUs and their raw parameters can be listed with
+
+  ls /sys/devices/*/format
+
+For example, the "LSD.UOPS" core PMU event above could
+be specified as
+
+  perf stat -e cpu/event=0xa8,umask=0x1,name=LSD.UOPS_CYCLES,cmask=1/ ...
+
+PER SOCKET PMUS
+---------------
+
+Some PMUs are not associated with a core, but with a whole CPU socket.
+Events on these PMUs generally cannot be sampled, but only counted globally
+with perf stat -a. They can be bound to one logical CPU, but will measure
+all the CPUs in the same socket.
+
+This example measures memory bandwidth every second
+on the first memory controller on socket 0 of an Intel Xeon system
+
+  perf stat -C 0 -a uncore_imc_0/cas_count_read/,uncore_imc_0/cas_count_write/ -I 1000 ...
+
+Each memory controller has its own PMU. Measuring the complete system
+bandwidth would require specifying all imc PMUs (see perf list output),
+and adding the values together.
+
+This example measures the combined core power every second
+
+  perf stat -I 1000 -e power/energy-cores/ -a
+
+ACCESS RESTRICTIONS
+-------------------
+
+For non-root users, generally only context-switched PMU events are available.
+This is normally only the events in the cpu PMU, the predefined events
+like cycles and instructions and some software events.
+
+Other PMUs and global measurements are normally root only.
+Some event qualifiers, such as "any", are also root only.
+
+This can be overridden by setting the kernel.perf_event_paranoid
+sysctl to -1, which allows non-root users to use these events.
+
+For accessing trace point events perf needs to have read access to
+/sys/kernel/debug/tracing, even when perf_event_paranoid is in a relaxed
+setting.
+
+TRACING
+-------
+
+Some PMUs control advanced hardware tracing capabilities, such as Intel PT,
+that allow low-overhead execution tracing. These are described in a separate
+intel-pt.txt document.
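To make the event=/umask=/cmask= fields in the example above concrete: perf assembles them into a single raw config value for perf_event_open(2), using the bit positions published in the PMU's format directory. The sketch below is illustrative only; it assumes the common Intel core PMU layout from /sys/devices/cpu/format (event config:0-7, umask config:8-15, cmask config:24-31), which should be verified on the target system.

  /* Hypothetical standalone counterpart of:
   *   perf stat -e cpu/event=0xa8,umask=0x1,cmask=1/
   * Counts the event for the calling process on any CPU. */
  #include <linux/perf_event.h>
  #include <sys/syscall.h>
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>

  int main(void)
  {
          struct perf_event_attr attr;
          unsigned long long count;
          int fd;

          memset(&attr, 0, sizeof(attr));
          attr.size = sizeof(attr);
          attr.type = PERF_TYPE_RAW;
          /* event | umask << 8 | cmask << 24, per the assumed format layout */
          attr.config = 0xa8 | (0x1ULL << 8) | (1ULL << 24);

          fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
          if (fd < 0)
                  return 1;
          /* ... run the workload to be measured ... */
          if (read(fd, &count, sizeof(count)) == (ssize_t)sizeof(count))
                  printf("raw event count: %llu\n", count);
          close(fd);
          return 0;
  }

perf itself derives these shifts from the PMU's format files at run time, so the cpu/event=...,umask=.../ syntax stays portable across CPUs that expose different layouts.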
+
 PARAMETERIZED EVENTS
 --------------------
@@ -106,6 +167,50 @@ also be supplied. For example:
 
   perf stat -C 0 -e 'hv_gpci/dtbp_ptitc,phys_processor_idx=0x2/' ...
 
+EVENT GROUPS
+------------
+
+Perf supports time-based multiplexing of events when the number of events
+active exceeds the number of hardware performance counters. Multiplexing
+can cause measurement errors when the workload changes its execution
+profile.
+
+When metrics are computed using formulas from event counts, it is useful to
+ensure some events are always measured together as a group to minimize multiplexing
+errors. Event groups can be specified using { }.
+
+  perf stat -e '{instructions,cycles}' ...
+
+The number of available performance counters depends on the CPU. A group
+cannot contain more events than available counters.
+For example, Intel Core CPUs typically have four generic performance counters
+for the core, plus three fixed counters for instructions, cycles and
+ref-cycles. Some special events have restrictions on which counter they
+can schedule, and may not support multiple instances in a single group.
+When too many events are specified in the group, none of them will
+be measured.
+
+Globally pinned events can limit the number of counters available for
+other groups. On x86 systems, the NMI watchdog pins a counter by default.
+The NMI watchdog can be disabled as root with
+
+  echo 0 > /proc/sys/kernel/nmi_watchdog
+
+Events from multiple different PMUs cannot be mixed in a group, with
+some exceptions for software events.
+
+LEADER SAMPLING
+---------------
+
+perf also supports group leader sampling using the :S specifier.
+
+  perf record -e '{cycles,instructions}:S' ...
+  perf report --group
+
+Normally all events in an event group sample, but with :S only
+the first event (the leader) samples, and it only reads the values of the
+other events in the group.
+
 OPTIONS
 -------
@@ -143,5 +248,5 @@ SEE ALSO
 --------
 linkperf:perf-stat[1], linkperf:perf-top[1], linkperf:perf-record[1],
-http://www.intel.com/Assets/PDF/manual/253669.pdf[Intel® 64 and IA-32 Architectures Software Developer's Manual Volume 3B: System Programming Guide],
+http://www.intel.com/sdm/[Intel® 64 and IA-32 Architectures Software Developer's Manual Volume 3B: System Programming Guide],
 http://support.amd.com/us/Processor_TechDocs/24593_APM_v2.pdf[AMD64 Architecture Programmer’s Manual Volume 2: System Programming]
diff --git a/tools/perf/Documentation/perf-mem.txt b/tools/perf/Documentation/perf-mem.txt
index 43310d8661fe..1d6092c460dd 100644
--- a/tools/perf/Documentation/perf-mem.txt
+++ b/tools/perf/Documentation/perf-mem.txt
@@ -48,6 +48,14 @@ OPTIONS
 	option can be passed in record mode. It will be interpreted the
 	same way as perf record.
 
+-K::
+--all-kernel::
+	Configure all used events to run in kernel space.
+
+-U::
+--all-user::
+	Configure all used events to run in user space.
+
 SEE ALSO
 --------
 linkperf:perf-record[1], linkperf:perf-report[1]
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index 19aa17532a16..8dbee832abd9 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -347,6 +347,19 @@ Configure all used events to run in kernel space.
 --all-user::
 Configure all used events to run in user space.
 
+--timestamp-filename::
+Append timestamp to output file name.
+
+--switch-output::
+Generate multiple perf.data files, timestamp prefixed, switching to a new one
+when receiving a SIGUSR2.
+ +A possible use case is to, given an external event, slice the perf.data file +that gets then processed, possibly via a perf script, to decide if that +particular perf.data snapshot should be kept or not. + +Implies --timestamp-filename, --no-buildid and --no-buildid-cache. + SEE ALSO -------- linkperf:perf-stat[1], linkperf:perf-list[1] diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index 12113992ac9d..ebaf849e30ef 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -248,7 +248,7 @@ OPTIONS Note that when using the --itrace option the synthesized callchain size will override this value if the synthesized callchain size is bigger. - Default: 127 + Default: /proc/sys/kernel/perf_event_max_stack when present, 127 otherwise. -G:: --inverted:: @@ -285,7 +285,7 @@ OPTIONS -f:: --force:: - Don't complain, do it. + Don't do ownership validation. --symfs=<directory>:: Look for files with symbols relative to this directory. diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt index 8ff4df956951..1cc08cc47ac5 100644 --- a/tools/perf/Documentation/perf-sched.txt +++ b/tools/perf/Documentation/perf-sched.txt @@ -50,6 +50,22 @@ OPTIONS --dump-raw-trace=:: Display verbose dump of the sched data. +OPTIONS for 'perf sched map' +---------------------------- + +--compact:: + Show only CPUs with activity. Helps visualizing on high core + count systems. + +--cpus:: + Show just entries with activities for the given CPUs. + +--color-cpus:: + Highlight the given cpus. + +--color-pids:: + Highlight the given pids. + SEE ALSO -------- linkperf:perf-record[1] diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index 382ddfb45d1d..a856a1095893 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -259,9 +259,23 @@ include::itrace.txt[] --full-source-path:: Show the full path for source files for srcline output. +--max-stack:: + Set the stack depth limit when parsing the callchain, anything + beyond the specified depth will be ignored. This is a trade-off + between information loss and faster processing especially for + workloads that can have a very long callchain stack. + Note that when using the --itrace option the synthesized callchain size + will override this value if the synthesized callchain size is bigger. + + Default: /proc/sys/kernel/perf_event_max_stack when present, 127 otherwise. + --ns:: Use 9 decimal places when displaying time (i.e. show the nanoseconds) +-f:: +--force:: + Don't do ownership validation. + SEE ALSO -------- linkperf:perf-record[1], linkperf:perf-script-perl[1], diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt index 19f046f027cd..91d638df3a6b 100644 --- a/tools/perf/Documentation/perf-top.txt +++ b/tools/perf/Documentation/perf-top.txt @@ -177,7 +177,7 @@ Default is to monitor all CPUS. between information loss and faster processing especially for workloads that can have a very long callchain stack. - Default: 127 + Default: /proc/sys/kernel/perf_event_max_stack when present, 127 otherwise. --ignore-callees=<regex>:: Ignore callees of the function(s) matching the given regex. 
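The pages updated above (perf report, perf script, perf top, and perf trace below) all describe the same new callchain depth default: the value of /proc/sys/kernel/perf_event_max_stack when that file exists, 127 otherwise. A minimal standalone sketch of that fallback logic follows; the names here are illustrative, not perf's own, and the tools themselves read the file through the procfs helpers added in tools/lib/api/fs earlier in this series.

  #include <stdio.h>

  #define FALLBACK_MAX_STACK 127	/* historical default */

  /* Illustrative helper: return the kernel's perf_event_max_stack
   * sysctl if it is readable, else the old fixed default. */
  static int default_max_stack(void)
  {
          FILE *f = fopen("/proc/sys/kernel/perf_event_max_stack", "r");
          int value = FALLBACK_MAX_STACK;

          if (f) {
                  if (fscanf(f, "%d", &value) != 1)
                          value = FALLBACK_MAX_STACK;
                  fclose(f);
          }
          return value;
  }

  int main(void)
  {
          printf("callchain depth limit: %d\n", default_max_stack());
          return 0;
  }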
diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt index 13293de8869f..6afe20121bc0 100644 --- a/tools/perf/Documentation/perf-trace.txt +++ b/tools/perf/Documentation/perf-trace.txt @@ -117,9 +117,41 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs. --syscalls:: Trace system calls. This options is enabled by default. +--call-graph [mode,type,min[,limit],order[,key][,branch]]:: + Setup and enable call-graph (stack chain/backtrace) recording. + See `--call-graph` section in perf-record and perf-report + man pages for details. The ones that are most useful in 'perf trace' + are 'dwarf' and 'lbr', where available, try: 'perf trace --call-graph dwarf'. + + Using this will, for the root user, bump the value of --mmap-pages to 4 + times the maximum for non-root users, based on the kernel.perf_event_mlock_kb + sysctl. This is done only if the user doesn't specify a --mmap-pages value. + +--kernel-syscall-graph:: + Show the kernel callchains on the syscall exit path. + --event:: Trace other events, see 'perf list' for a complete list. +--max-stack:: + Set the stack depth limit when parsing the callchain, anything + beyond the specified depth will be ignored. Note that at this point + this is just about the presentation part, i.e. the kernel is still + not limiting, the overhead of callchains needs to be set via the + knobs in --call-graph dwarf. + + Implies '--call-graph dwarf' when --call-graph not present on the + command line, on systems where DWARF unwinding was built in. + + Default: /proc/sys/kernel/perf_event_max_stack when present, 127 otherwise. + +--min-stack:: + Set the stack depth limit when parsing the callchain, anything + below the specified depth will be ignored. Disabled by default. + + Implies '--call-graph dwarf' when --call-graph not present on the + command line, on systems where DWARF unwinding was built in. + --proc-map-timeout:: When processing pre-existing threads /proc/XXX/mmap, it may take a long time, because the file may be huge. A time out is needed in such cases. diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 000ea210389d..bde8cbae7dd9 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -183,6 +183,11 @@ endif include config/Makefile endif +ifeq ($(config),0) +include $(srctree)/tools/scripts/Makefile.arch +-include arch/$(ARCH)/Makefile +endif + # The FEATURE_DUMP_EXPORT holds location of the actual # FEATURE_DUMP file to be used to bypass feature detection # (for bpf or any other subproject) @@ -297,8 +302,6 @@ endif # because maintaining the nesting to match is a pain. If # we had "elif" things would have been much nicer... --include arch/$(ARCH)/Makefile - ifneq ($(OUTPUT),) CFLAGS += -I$(OUTPUT) endif @@ -390,7 +393,7 @@ endif __build-dir = $(subst $(OUTPUT),,$(dir $@)) build-dir = $(if $(__build-dir),$(__build-dir),.) 
-prepare: $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)common-cmds.h fixdep +prepare: $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)common-cmds.h fixdep archheaders $(OUTPUT)%.o: %.c prepare FORCE $(Q)$(MAKE) -f $(srctree)/tools/build/Makefile.build dir=$(build-dir) $@ @@ -430,7 +433,7 @@ $(patsubst perf-%,%.o,$(PROGRAMS)): $(wildcard */*.h) LIBPERF_IN := $(OUTPUT)libperf-in.o -$(LIBPERF_IN): fixdep FORCE +$(LIBPERF_IN): prepare fixdep FORCE $(Q)$(MAKE) $(build)=libperf $(LIB_FILE): $(LIBPERF_IN) @@ -625,7 +628,7 @@ config-clean: $(call QUIET_CLEAN, config) $(Q)$(MAKE) -C $(srctree)/tools/build/feature/ $(if $(OUTPUT),OUTPUT=$(OUTPUT)feature/,) clean >/dev/null -clean: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean config-clean +clean:: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean config-clean $(call QUIET_CLEAN, core-objs) $(RM) $(LIB_FILE) $(OUTPUT)perf-archive $(OUTPUT)perf-with-kcore $(LANG_BINDINGS) $(Q)find $(if $(OUTPUT),$(OUTPUT),.) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete $(Q)$(RM) $(OUTPUT).config-detected @@ -662,5 +665,5 @@ FORCE: .PHONY: all install clean config-clean strip install-gtk .PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell .PHONY: $(GIT-HEAD-PHONY) TAGS tags cscope FORCE prepare -.PHONY: libtraceevent_plugins +.PHONY: libtraceevent_plugins archheaders diff --git a/tools/perf/arch/powerpc/Makefile b/tools/perf/arch/powerpc/Makefile index 56e05f126ad8..cc3930904d68 100644 --- a/tools/perf/arch/powerpc/Makefile +++ b/tools/perf/arch/powerpc/Makefile @@ -3,4 +3,5 @@ PERF_HAVE_DWARF_REGS := 1 endif HAVE_KVM_STAT_SUPPORT := 1 +PERF_HAVE_ARCH_REGS_QUERY_REGISTER_OFFSET := 1 PERF_HAVE_JITDUMP := 1 diff --git a/tools/perf/arch/powerpc/util/dwarf-regs.c b/tools/perf/arch/powerpc/util/dwarf-regs.c index 733151cdf46e..41bdf9530d82 100644 --- a/tools/perf/arch/powerpc/util/dwarf-regs.c +++ b/tools/perf/arch/powerpc/util/dwarf-regs.c @@ -10,19 +10,26 @@ */ #include <stddef.h> +#include <errno.h> +#include <string.h> #include <dwarf-regs.h> - +#include <linux/ptrace.h> +#include <linux/kernel.h> +#include "util.h" struct pt_regs_dwarfnum { const char *name; unsigned int dwarfnum; + unsigned int ptregs_offset; }; -#define STR(s) #s -#define REG_DWARFNUM_NAME(r, num) {.name = r, .dwarfnum = num} -#define GPR_DWARFNUM_NAME(num) \ - {.name = STR(%gpr##num), .dwarfnum = num} -#define REG_DWARFNUM_END {.name = NULL, .dwarfnum = 0} +#define REG_DWARFNUM_NAME(r, num) \ + {.name = STR(%)STR(r), .dwarfnum = num, \ + .ptregs_offset = offsetof(struct pt_regs, r)} +#define GPR_DWARFNUM_NAME(num) \ + {.name = STR(%gpr##num), .dwarfnum = num, \ + .ptregs_offset = offsetof(struct pt_regs, gpr[num])} +#define REG_DWARFNUM_END {.name = NULL, .dwarfnum = 0, .ptregs_offset = 0} /* * Reference: @@ -61,12 +68,12 @@ static const struct pt_regs_dwarfnum regdwarfnum_table[] = { GPR_DWARFNUM_NAME(29), GPR_DWARFNUM_NAME(30), GPR_DWARFNUM_NAME(31), - REG_DWARFNUM_NAME("%msr", 66), - REG_DWARFNUM_NAME("%ctr", 109), - REG_DWARFNUM_NAME("%link", 108), - REG_DWARFNUM_NAME("%xer", 101), - REG_DWARFNUM_NAME("%dar", 119), - REG_DWARFNUM_NAME("%dsisr", 118), + REG_DWARFNUM_NAME(msr, 66), + REG_DWARFNUM_NAME(ctr, 109), + REG_DWARFNUM_NAME(link, 108), + REG_DWARFNUM_NAME(xer, 101), + REG_DWARFNUM_NAME(dar, 119), + REG_DWARFNUM_NAME(dsisr, 118), REG_DWARFNUM_END, }; @@ -86,3 +93,12 @@ const char *get_arch_regstr(unsigned int n) return roff->name; return NULL; } + +int regs_query_register_offset(const char *name) +{ + const 
struct pt_regs_dwarfnum *roff; + for (roff = regdwarfnum_table; roff->name != NULL; roff++) + if (!strcmp(roff->name, name)) + return roff->ptregs_offset; + return -EINVAL; +} diff --git a/tools/perf/arch/powerpc/util/sym-handling.c b/tools/perf/arch/powerpc/util/sym-handling.c index bbc1a50768dd..c6d0f91731a1 100644 --- a/tools/perf/arch/powerpc/util/sym-handling.c +++ b/tools/perf/arch/powerpc/util/sym-handling.c @@ -19,12 +19,6 @@ bool elf__needs_adjust_symbols(GElf_Ehdr ehdr) ehdr.e_type == ET_DYN; } -#if defined(_CALL_ELF) && _CALL_ELF == 2 -void arch__elf_sym_adjust(GElf_Sym *sym) -{ - sym->st_value += PPC64_LOCAL_ENTRY_OFFSET(sym->st_other); -} -#endif #endif #if !defined(_CALL_ELF) || _CALL_ELF != 2 @@ -65,18 +59,45 @@ bool arch__prefers_symtab(void) return true; } +#ifdef HAVE_LIBELF_SUPPORT +void arch__sym_update(struct symbol *s, GElf_Sym *sym) +{ + s->arch_sym = sym->st_other; +} +#endif + #define PPC64LE_LEP_OFFSET 8 void arch__fix_tev_from_maps(struct perf_probe_event *pev, - struct probe_trace_event *tev, struct map *map) + struct probe_trace_event *tev, struct map *map, + struct symbol *sym) { + int lep_offset; + /* - * ppc64 ABIv2 local entry point is currently always 2 instructions - * (8 bytes) after the global entry point. + * When probing at a function entry point, we normally always want the + * LEP since that catches calls to the function through both the GEP and + * the LEP. Hence, we would like to probe at an offset of 8 bytes if + * the user only specified the function entry. + * + * However, if the user specifies an offset, we fall back to using the + * GEP since all userspace applications (objdump/readelf) show function + * disassembly with offsets from the GEP. + * + * In addition, we shouldn't specify an offset for kretprobes. 
*/ - if (!pev->uprobes && map->dso->symtab_type == DSO_BINARY_TYPE__KALLSYMS) { - tev->point.address += PPC64LE_LEP_OFFSET; + if (pev->point.offset || pev->point.retprobe || !map || !sym) + return; + + lep_offset = PPC64_LOCAL_ENTRY_OFFSET(sym->arch_sym); + + if (map->dso->symtab_type == DSO_BINARY_TYPE__KALLSYMS) tev->point.offset += PPC64LE_LEP_OFFSET; + else if (lep_offset) { + if (pev->uprobes) + tev->point.address += lep_offset; + else + tev->point.offset += lep_offset; } } #endif diff --git a/tools/perf/arch/x86/Makefile b/tools/perf/arch/x86/Makefile index 269af2143735..6c9211b18ec0 100644 --- a/tools/perf/arch/x86/Makefile +++ b/tools/perf/arch/x86/Makefile @@ -4,3 +4,26 @@ endif HAVE_KVM_STAT_SUPPORT := 1 PERF_HAVE_ARCH_REGS_QUERY_REGISTER_OFFSET := 1 PERF_HAVE_JITDUMP := 1 + +### +# Syscall table generation +# + +out := $(OUTPUT)arch/x86/include/generated/asm +header := $(out)/syscalls_64.c +sys := $(srctree)/tools/perf/arch/x86/entry/syscalls +systbl := $(sys)/syscalltbl.sh + +# Create output directory if not already present +_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') + +$(header): $(sys)/syscall_64.tbl $(systbl) + @(test -d ../../kernel -a -d ../../tools -a -d ../perf && ( \ + (diff -B arch/x86/entry/syscalls/syscall_64.tbl ../../arch/x86/entry/syscalls/syscall_64.tbl >/dev/null) \ + || echo "Warning: x86_64's syscall_64.tbl differs from kernel" >&2 )) || true + $(Q)$(SHELL) '$(systbl)' $(sys)/syscall_64.tbl 'x86_64' > $@ + +clean:: + $(call QUIET_CLEAN, x86) $(RM) $(header) + +archheaders: $(header) diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl new file mode 100644 index 000000000000..cac6d17ce5db --- /dev/null +++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl @@ -0,0 +1,376 @@ +# +# 64-bit system call numbers and entry vectors +# +# The format is: +# <number> <abi> <name> <entry point> +# +# The abi is "common", "64" or "x32" for this file. 
+# +0 common read sys_read +1 common write sys_write +2 common open sys_open +3 common close sys_close +4 common stat sys_newstat +5 common fstat sys_newfstat +6 common lstat sys_newlstat +7 common poll sys_poll +8 common lseek sys_lseek +9 common mmap sys_mmap +10 common mprotect sys_mprotect +11 common munmap sys_munmap +12 common brk sys_brk +13 64 rt_sigaction sys_rt_sigaction +14 common rt_sigprocmask sys_rt_sigprocmask +15 64 rt_sigreturn sys_rt_sigreturn/ptregs +16 64 ioctl sys_ioctl +17 common pread64 sys_pread64 +18 common pwrite64 sys_pwrite64 +19 64 readv sys_readv +20 64 writev sys_writev +21 common access sys_access +22 common pipe sys_pipe +23 common select sys_select +24 common sched_yield sys_sched_yield +25 common mremap sys_mremap +26 common msync sys_msync +27 common mincore sys_mincore +28 common madvise sys_madvise +29 common shmget sys_shmget +30 common shmat sys_shmat +31 common shmctl sys_shmctl +32 common dup sys_dup +33 common dup2 sys_dup2 +34 common pause sys_pause +35 common nanosleep sys_nanosleep +36 common getitimer sys_getitimer +37 common alarm sys_alarm +38 common setitimer sys_setitimer +39 common getpid sys_getpid +40 common sendfile sys_sendfile64 +41 common socket sys_socket +42 common connect sys_connect +43 common accept sys_accept +44 common sendto sys_sendto +45 64 recvfrom sys_recvfrom +46 64 sendmsg sys_sendmsg +47 64 recvmsg sys_recvmsg +48 common shutdown sys_shutdown +49 common bind sys_bind +50 common listen sys_listen +51 common getsockname sys_getsockname +52 common getpeername sys_getpeername +53 common socketpair sys_socketpair +54 64 setsockopt sys_setsockopt +55 64 getsockopt sys_getsockopt +56 common clone sys_clone/ptregs +57 common fork sys_fork/ptregs +58 common vfork sys_vfork/ptregs +59 64 execve sys_execve/ptregs +60 common exit sys_exit +61 common wait4 sys_wait4 +62 common kill sys_kill +63 common uname sys_newuname +64 common semget sys_semget +65 common semop sys_semop +66 common semctl sys_semctl +67 common shmdt sys_shmdt +68 common msgget sys_msgget +69 common msgsnd sys_msgsnd +70 common msgrcv sys_msgrcv +71 common msgctl sys_msgctl +72 common fcntl sys_fcntl +73 common flock sys_flock +74 common fsync sys_fsync +75 common fdatasync sys_fdatasync +76 common truncate sys_truncate +77 common ftruncate sys_ftruncate +78 common getdents sys_getdents +79 common getcwd sys_getcwd +80 common chdir sys_chdir +81 common fchdir sys_fchdir +82 common rename sys_rename +83 common mkdir sys_mkdir +84 common rmdir sys_rmdir +85 common creat sys_creat +86 common link sys_link +87 common unlink sys_unlink +88 common symlink sys_symlink +89 common readlink sys_readlink +90 common chmod sys_chmod +91 common fchmod sys_fchmod +92 common chown sys_chown +93 common fchown sys_fchown +94 common lchown sys_lchown +95 common umask sys_umask +96 common gettimeofday sys_gettimeofday +97 common getrlimit sys_getrlimit +98 common getrusage sys_getrusage +99 common sysinfo sys_sysinfo +100 common times sys_times +101 64 ptrace sys_ptrace +102 common getuid sys_getuid +103 common syslog sys_syslog +104 common getgid sys_getgid +105 common setuid sys_setuid +106 common setgid sys_setgid +107 common geteuid sys_geteuid +108 common getegid sys_getegid +109 common setpgid sys_setpgid +110 common getppid sys_getppid +111 common getpgrp sys_getpgrp +112 common setsid sys_setsid +113 common setreuid sys_setreuid +114 common setregid sys_setregid +115 common getgroups sys_getgroups +116 common setgroups sys_setgroups +117 common setresuid sys_setresuid +118 
common getresuid sys_getresuid +119 common setresgid sys_setresgid +120 common getresgid sys_getresgid +121 common getpgid sys_getpgid +122 common setfsuid sys_setfsuid +123 common setfsgid sys_setfsgid +124 common getsid sys_getsid +125 common capget sys_capget +126 common capset sys_capset +127 64 rt_sigpending sys_rt_sigpending +128 64 rt_sigtimedwait sys_rt_sigtimedwait +129 64 rt_sigqueueinfo sys_rt_sigqueueinfo +130 common rt_sigsuspend sys_rt_sigsuspend +131 64 sigaltstack sys_sigaltstack +132 common utime sys_utime +133 common mknod sys_mknod +134 64 uselib +135 common personality sys_personality +136 common ustat sys_ustat +137 common statfs sys_statfs +138 common fstatfs sys_fstatfs +139 common sysfs sys_sysfs +140 common getpriority sys_getpriority +141 common setpriority sys_setpriority +142 common sched_setparam sys_sched_setparam +143 common sched_getparam sys_sched_getparam +144 common sched_setscheduler sys_sched_setscheduler +145 common sched_getscheduler sys_sched_getscheduler +146 common sched_get_priority_max sys_sched_get_priority_max +147 common sched_get_priority_min sys_sched_get_priority_min +148 common sched_rr_get_interval sys_sched_rr_get_interval +149 common mlock sys_mlock +150 common munlock sys_munlock +151 common mlockall sys_mlockall +152 common munlockall sys_munlockall +153 common vhangup sys_vhangup +154 common modify_ldt sys_modify_ldt +155 common pivot_root sys_pivot_root +156 64 _sysctl sys_sysctl +157 common prctl sys_prctl +158 common arch_prctl sys_arch_prctl +159 common adjtimex sys_adjtimex +160 common setrlimit sys_setrlimit +161 common chroot sys_chroot +162 common sync sys_sync +163 common acct sys_acct +164 common settimeofday sys_settimeofday +165 common mount sys_mount +166 common umount2 sys_umount +167 common swapon sys_swapon +168 common swapoff sys_swapoff +169 common reboot sys_reboot +170 common sethostname sys_sethostname +171 common setdomainname sys_setdomainname +172 common iopl sys_iopl/ptregs +173 common ioperm sys_ioperm +174 64 create_module +175 common init_module sys_init_module +176 common delete_module sys_delete_module +177 64 get_kernel_syms +178 64 query_module +179 common quotactl sys_quotactl +180 64 nfsservctl +181 common getpmsg +182 common putpmsg +183 common afs_syscall +184 common tuxcall +185 common security +186 common gettid sys_gettid +187 common readahead sys_readahead +188 common setxattr sys_setxattr +189 common lsetxattr sys_lsetxattr +190 common fsetxattr sys_fsetxattr +191 common getxattr sys_getxattr +192 common lgetxattr sys_lgetxattr +193 common fgetxattr sys_fgetxattr +194 common listxattr sys_listxattr +195 common llistxattr sys_llistxattr +196 common flistxattr sys_flistxattr +197 common removexattr sys_removexattr +198 common lremovexattr sys_lremovexattr +199 common fremovexattr sys_fremovexattr +200 common tkill sys_tkill +201 common time sys_time +202 common futex sys_futex +203 common sched_setaffinity sys_sched_setaffinity +204 common sched_getaffinity sys_sched_getaffinity +205 64 set_thread_area +206 64 io_setup sys_io_setup +207 common io_destroy sys_io_destroy +208 common io_getevents sys_io_getevents +209 64 io_submit sys_io_submit +210 common io_cancel sys_io_cancel +211 64 get_thread_area +212 common lookup_dcookie sys_lookup_dcookie +213 common epoll_create sys_epoll_create +214 64 epoll_ctl_old +215 64 epoll_wait_old +216 common remap_file_pages sys_remap_file_pages +217 common getdents64 sys_getdents64 +218 common set_tid_address sys_set_tid_address +219 common restart_syscall 
sys_restart_syscall +220 common semtimedop sys_semtimedop +221 common fadvise64 sys_fadvise64 +222 64 timer_create sys_timer_create +223 common timer_settime sys_timer_settime +224 common timer_gettime sys_timer_gettime +225 common timer_getoverrun sys_timer_getoverrun +226 common timer_delete sys_timer_delete +227 common clock_settime sys_clock_settime +228 common clock_gettime sys_clock_gettime +229 common clock_getres sys_clock_getres +230 common clock_nanosleep sys_clock_nanosleep +231 common exit_group sys_exit_group +232 common epoll_wait sys_epoll_wait +233 common epoll_ctl sys_epoll_ctl +234 common tgkill sys_tgkill +235 common utimes sys_utimes +236 64 vserver +237 common mbind sys_mbind +238 common set_mempolicy sys_set_mempolicy +239 common get_mempolicy sys_get_mempolicy +240 common mq_open sys_mq_open +241 common mq_unlink sys_mq_unlink +242 common mq_timedsend sys_mq_timedsend +243 common mq_timedreceive sys_mq_timedreceive +244 64 mq_notify sys_mq_notify +245 common mq_getsetattr sys_mq_getsetattr +246 64 kexec_load sys_kexec_load +247 64 waitid sys_waitid +248 common add_key sys_add_key +249 common request_key sys_request_key +250 common keyctl sys_keyctl +251 common ioprio_set sys_ioprio_set +252 common ioprio_get sys_ioprio_get +253 common inotify_init sys_inotify_init +254 common inotify_add_watch sys_inotify_add_watch +255 common inotify_rm_watch sys_inotify_rm_watch +256 common migrate_pages sys_migrate_pages +257 common openat sys_openat +258 common mkdirat sys_mkdirat +259 common mknodat sys_mknodat +260 common fchownat sys_fchownat +261 common futimesat sys_futimesat +262 common newfstatat sys_newfstatat +263 common unlinkat sys_unlinkat +264 common renameat sys_renameat +265 common linkat sys_linkat +266 common symlinkat sys_symlinkat +267 common readlinkat sys_readlinkat +268 common fchmodat sys_fchmodat +269 common faccessat sys_faccessat +270 common pselect6 sys_pselect6 +271 common ppoll sys_ppoll +272 common unshare sys_unshare +273 64 set_robust_list sys_set_robust_list +274 64 get_robust_list sys_get_robust_list +275 common splice sys_splice +276 common tee sys_tee +277 common sync_file_range sys_sync_file_range +278 64 vmsplice sys_vmsplice +279 64 move_pages sys_move_pages +280 common utimensat sys_utimensat +281 common epoll_pwait sys_epoll_pwait +282 common signalfd sys_signalfd +283 common timerfd_create sys_timerfd_create +284 common eventfd sys_eventfd +285 common fallocate sys_fallocate +286 common timerfd_settime sys_timerfd_settime +287 common timerfd_gettime sys_timerfd_gettime +288 common accept4 sys_accept4 +289 common signalfd4 sys_signalfd4 +290 common eventfd2 sys_eventfd2 +291 common epoll_create1 sys_epoll_create1 +292 common dup3 sys_dup3 +293 common pipe2 sys_pipe2 +294 common inotify_init1 sys_inotify_init1 +295 64 preadv sys_preadv +296 64 pwritev sys_pwritev +297 64 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo +298 common perf_event_open sys_perf_event_open +299 64 recvmmsg sys_recvmmsg +300 common fanotify_init sys_fanotify_init +301 common fanotify_mark sys_fanotify_mark +302 common prlimit64 sys_prlimit64 +303 common name_to_handle_at sys_name_to_handle_at +304 common open_by_handle_at sys_open_by_handle_at +305 common clock_adjtime sys_clock_adjtime +306 common syncfs sys_syncfs +307 64 sendmmsg sys_sendmmsg +308 common setns sys_setns +309 common getcpu sys_getcpu +310 64 process_vm_readv sys_process_vm_readv +311 64 process_vm_writev sys_process_vm_writev +312 common kcmp sys_kcmp +313 common finit_module sys_finit_module +314 common 
sched_setattr sys_sched_setattr +315 common sched_getattr sys_sched_getattr +316 common renameat2 sys_renameat2 +317 common seccomp sys_seccomp +318 common getrandom sys_getrandom +319 common memfd_create sys_memfd_create +320 common kexec_file_load sys_kexec_file_load +321 common bpf sys_bpf +322 64 execveat sys_execveat/ptregs +323 common userfaultfd sys_userfaultfd +324 common membarrier sys_membarrier +325 common mlock2 sys_mlock2 +326 common copy_file_range sys_copy_file_range +327 64 preadv2 sys_preadv2 +328 64 pwritev2 sys_pwritev2 + +# +# x32-specific system call numbers start at 512 to avoid cache impact +# for native 64-bit operation. +# +512 x32 rt_sigaction compat_sys_rt_sigaction +513 x32 rt_sigreturn sys32_x32_rt_sigreturn +514 x32 ioctl compat_sys_ioctl +515 x32 readv compat_sys_readv +516 x32 writev compat_sys_writev +517 x32 recvfrom compat_sys_recvfrom +518 x32 sendmsg compat_sys_sendmsg +519 x32 recvmsg compat_sys_recvmsg +520 x32 execve compat_sys_execve/ptregs +521 x32 ptrace compat_sys_ptrace +522 x32 rt_sigpending compat_sys_rt_sigpending +523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait +524 x32 rt_sigqueueinfo compat_sys_rt_sigqueueinfo +525 x32 sigaltstack compat_sys_sigaltstack +526 x32 timer_create compat_sys_timer_create +527 x32 mq_notify compat_sys_mq_notify +528 x32 kexec_load compat_sys_kexec_load +529 x32 waitid compat_sys_waitid +530 x32 set_robust_list compat_sys_set_robust_list +531 x32 get_robust_list compat_sys_get_robust_list +532 x32 vmsplice compat_sys_vmsplice +533 x32 move_pages compat_sys_move_pages +534 x32 preadv compat_sys_preadv64 +535 x32 pwritev compat_sys_pwritev64 +536 x32 rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo +537 x32 recvmmsg compat_sys_recvmmsg +538 x32 sendmmsg compat_sys_sendmmsg +539 x32 process_vm_readv compat_sys_process_vm_readv +540 x32 process_vm_writev compat_sys_process_vm_writev +541 x32 setsockopt compat_sys_setsockopt +542 x32 getsockopt compat_sys_getsockopt +543 x32 io_setup compat_sys_io_setup +544 x32 io_submit compat_sys_io_submit +545 x32 execveat compat_sys_execveat/ptregs diff --git a/tools/perf/arch/x86/entry/syscalls/syscalltbl.sh b/tools/perf/arch/x86/entry/syscalls/syscalltbl.sh new file mode 100755 index 000000000000..49a18b9ad9cf --- /dev/null +++ b/tools/perf/arch/x86/entry/syscalls/syscalltbl.sh @@ -0,0 +1,39 @@ +#!/bin/sh + +in="$1" +arch="$2" + +syscall_macro() { + nr="$1" + name="$2" + + echo " [$nr] = \"$name\"," +} + +emit() { + nr="$1" + entry="$2" + + syscall_macro "$nr" "$entry" +} + +echo "static const char *syscalltbl_${arch}[] = {" + +sorted_table=$(mktemp /tmp/syscalltbl.XXXXXX) +grep '^[0-9]' "$in" | sort -n > $sorted_table + +max_nr=0 +while read nr abi name entry compat; do + if [ $nr -ge 512 ] ; then # discard compat sycalls + break + fi + + emit "$nr" "$name" + max_nr=$nr +done < $sorted_table + +rm -f $sorted_table + +echo "};" + +echo "#define SYSCALLTBL_${arch}_MAX_ID ${max_nr}" diff --git a/tools/perf/arch/x86/tests/perf-time-to-tsc.c b/tools/perf/arch/x86/tests/perf-time-to-tsc.c index 9d29ee283ac5..d4aa567a29c4 100644 --- a/tools/perf/arch/x86/tests/perf-time-to-tsc.c +++ b/tools/perf/arch/x86/tests/perf-time-to-tsc.c @@ -71,7 +71,7 @@ int test__perf_time_to_tsc(int subtest __maybe_unused) CHECK__(parse_events(evlist, "cycles:u", NULL)); - perf_evlist__config(evlist, &opts); + perf_evlist__config(evlist, &opts, NULL); evsel = perf_evlist__first(evlist); diff --git a/tools/perf/arch/x86/util/dwarf-regs.c b/tools/perf/arch/x86/util/dwarf-regs.c index 
9223c164e545..1f86ee8fb831 100644 --- a/tools/perf/arch/x86/util/dwarf-regs.c +++ b/tools/perf/arch/x86/util/dwarf-regs.c @@ -63,6 +63,8 @@ struct pt_regs_offset { # define REG_OFFSET_NAME_32(n, r) {.name = n, .offset = offsetof(struct pt_regs, r)} #endif +/* TODO: switching by dwarf address size */ +#ifndef __x86_64__ static const struct pt_regs_offset x86_32_regoffset_table[] = { REG_OFFSET_NAME_32("%ax", eax), REG_OFFSET_NAME_32("%cx", ecx), @@ -75,6 +77,8 @@ static const struct pt_regs_offset x86_32_regoffset_table[] = { REG_OFFSET_END, }; +#define regoffset_table x86_32_regoffset_table +#else static const struct pt_regs_offset x86_64_regoffset_table[] = { REG_OFFSET_NAME_64("%ax", rax), REG_OFFSET_NAME_64("%dx", rdx), @@ -95,11 +99,7 @@ static const struct pt_regs_offset x86_64_regoffset_table[] = { REG_OFFSET_END, }; -/* TODO: switching by dwarf address size */ -#ifdef __x86_64__ #define regoffset_table x86_64_regoffset_table -#else -#define regoffset_table x86_32_regoffset_table #endif /* Minus 1 for the ending REG_OFFSET_END */ diff --git a/tools/perf/arch/x86/util/intel-bts.c b/tools/perf/arch/x86/util/intel-bts.c index d66f9ad4df2e..7dc30637cf66 100644 --- a/tools/perf/arch/x86/util/intel-bts.c +++ b/tools/perf/arch/x86/util/intel-bts.c @@ -438,6 +438,11 @@ struct auxtrace_record *intel_bts_recording_init(int *err) if (!intel_bts_pmu) return NULL; + if (setenv("JITDUMP_USE_ARCH_TIMESTAMP", "1", 1)) { + *err = -errno; + return NULL; + } + btsr = zalloc(sizeof(struct intel_bts_recording)); if (!btsr) { *err = -ENOMEM; diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c index a3395179c9ee..a07b9605e93b 100644 --- a/tools/perf/arch/x86/util/intel-pt.c +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -1027,6 +1027,11 @@ struct auxtrace_record *intel_pt_recording_init(int *err) if (!intel_pt_pmu) return NULL; + if (setenv("JITDUMP_USE_ARCH_TIMESTAMP", "1", 1)) { + *err = -errno; + return NULL; + } + ptr = zalloc(sizeof(struct intel_pt_recording)); if (!ptr) { *err = -ENOMEM; diff --git a/tools/perf/arch/x86/util/tsc.c b/tools/perf/arch/x86/util/tsc.c index fd2868490d00..357f1b13b5ae 100644 --- a/tools/perf/arch/x86/util/tsc.c +++ b/tools/perf/arch/x86/util/tsc.c @@ -7,7 +7,6 @@ #include <linux/types.h> #include "../../util/debug.h" #include "../../util/tsc.h" -#include "tsc.h" int perf_read_tsc_conversion(const struct perf_event_mmap_page *pc, struct perf_tsc_conversion *tc) @@ -46,3 +45,34 @@ u64 rdtsc(void) return low | ((u64)high) << 32; } + +int perf_event__synth_time_conv(const struct perf_event_mmap_page *pc, + struct perf_tool *tool, + perf_event__handler_t process, + struct machine *machine) +{ + union perf_event event = { + .time_conv = { + .header = { + .type = PERF_RECORD_TIME_CONV, + .size = sizeof(struct time_conv_event), + }, + }, + }; + struct perf_tsc_conversion tc; + int err; + + err = perf_read_tsc_conversion(pc, &tc); + if (err == -EOPNOTSUPP) + return 0; + if (err) + return err; + + pr_debug2("Synthesizing TSC conversion information\n"); + + event.time_conv.time_mult = tc.time_mult; + event.time_conv.time_shift = tc.time_shift; + event.time_conv.time_zero = tc.time_zero; + + return process(tool, &event, NULL, machine); +} diff --git a/tools/perf/arch/x86/util/tsc.h b/tools/perf/arch/x86/util/tsc.h deleted file mode 100644 index 2edc4d31065c..000000000000 --- a/tools/perf/arch/x86/util/tsc.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef TOOLS_PERF_ARCH_X86_UTIL_TSC_H__ -#define TOOLS_PERF_ARCH_X86_UTIL_TSC_H__ - -#include <linux/types.h> - 
-struct perf_tsc_conversion { - u16 time_shift; - u32 time_mult; - u64 time_zero; -}; - -struct perf_event_mmap_page; - -int perf_read_tsc_conversion(const struct perf_event_mmap_page *pc, - struct perf_tsc_conversion *tc); - -#endif /* TOOLS_PERF_ARCH_X86_UTIL_TSC_H__ */ diff --git a/tools/perf/bench/futex-lock-pi.c b/tools/perf/bench/futex-lock-pi.c index 6a18ce21f865..6952db65508a 100644 --- a/tools/perf/bench/futex-lock-pi.c +++ b/tools/perf/bench/futex-lock-pi.c @@ -83,7 +83,7 @@ static void *workerfn(void *arg) do { int ret; again: - ret = futex_lock_pi(w->futex, NULL, 0, futex_flag); + ret = futex_lock_pi(w->futex, NULL, futex_flag); if (ret) { /* handle lock acquisition */ if (!silent) diff --git a/tools/perf/bench/futex.h b/tools/perf/bench/futex.h index d44de9f44281..b2e06d1190d0 100644 --- a/tools/perf/bench/futex.h +++ b/tools/perf/bench/futex.h @@ -57,13 +57,11 @@ futex_wake(u_int32_t *uaddr, int nr_wake, int opflags) /** * futex_lock_pi() - block on uaddr as a PI mutex - * @detect: whether (1) or not (0) to perform deadlock detection */ static inline int -futex_lock_pi(u_int32_t *uaddr, struct timespec *timeout, int detect, - int opflags) +futex_lock_pi(u_int32_t *uaddr, struct timespec *timeout, int opflags) { - return futex(uaddr, FUTEX_LOCK_PI, detect, timeout, NULL, 0, opflags); + return futex(uaddr, FUTEX_LOCK_PI, 0, timeout, NULL, 0, opflags); } /** diff --git a/tools/perf/bench/mem-functions.c b/tools/perf/bench/mem-functions.c index a91aa85d80ff..2b54d0f2672a 100644 --- a/tools/perf/bench/mem-functions.c +++ b/tools/perf/bench/mem-functions.c @@ -6,6 +6,7 @@ * Written by Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp> */ +#include "debug.h" #include "../perf.h" #include "../util/util.h" #include <subcmd/parse-options.h> @@ -63,14 +64,16 @@ static struct perf_event_attr cycle_attr = { .config = PERF_COUNT_HW_CPU_CYCLES }; -static void init_cycles(void) +static int init_cycles(void) { cycles_fd = sys_perf_event_open(&cycle_attr, getpid(), -1, -1, perf_event_open_cloexec_flag()); - if (cycles_fd < 0 && errno == ENOSYS) - die("No CONFIG_PERF_EVENTS=y kernel support configured?\n"); - else - BUG_ON(cycles_fd < 0); + if (cycles_fd < 0 && errno == ENOSYS) { + pr_debug("No CONFIG_PERF_EVENTS=y kernel support configured?\n"); + return -1; + } + + return cycles_fd; } static u64 get_cycles(void) @@ -155,8 +158,13 @@ static int bench_mem_common(int argc, const char **argv, struct bench_mem_info * argc = parse_options(argc, argv, options, info->usage, 0); - if (use_cycles) - init_cycles(); + if (use_cycles) { + i = init_cycles(); + if (i < 0) { + fprintf(stderr, "Failed to open cycles counter\n"); + return i; + } + } size = (size_t)perf_atoll((char *)size_str); size_total = (double)size * nr_loops; diff --git a/tools/perf/builtin-config.c b/tools/perf/builtin-config.c index c42448ed5dfe..fe1b77fa21f9 100644 --- a/tools/perf/builtin-config.c +++ b/tools/perf/builtin-config.c @@ -12,6 +12,7 @@ #include <subcmd/parse-options.h> #include "util/util.h" #include "util/debug.h" +#include "util/config.h" static bool use_system_config, use_user_config; @@ -32,13 +33,28 @@ static struct option config_options[] = { OPT_END() }; -static int show_config(const char *key, const char *value, - void *cb __maybe_unused) +static int show_config(struct perf_config_set *set) { - if (value) - printf("%s=%s\n", key, value); - else - printf("%s\n", key); + struct perf_config_section *section; + struct perf_config_item *item; + struct list_head *sections; + + if (set == NULL) + return -1; + + sections = 
&set->sections; + if (list_empty(sections)) + return -1; + + list_for_each_entry(section, sections, node) { + list_for_each_entry(item, §ion->items, node) { + char *value = item->value; + + if (value) + printf("%s.%s=%s\n", section->name, + item->name, value); + } + } return 0; } @@ -46,6 +62,7 @@ static int show_config(const char *key, const char *value, int cmd_config(int argc, const char **argv, const char *prefix __maybe_unused) { int ret = 0; + struct perf_config_set *set; char *user_config = mkpath("%s/.perfconfig", getenv("HOME")); argc = parse_options(argc, argv, config_options, config_usage, @@ -63,13 +80,19 @@ int cmd_config(int argc, const char **argv, const char *prefix __maybe_unused) else if (use_user_config) config_exclusive_filename = user_config; + set = perf_config_set__new(); + if (!set) { + ret = -1; + goto out_err; + } + switch (actions) { case ACTION_LIST: if (argc) { pr_err("Error: takes no arguments\n"); parse_options_usage(config_usage, config_options, "l", 1); } else { - ret = perf_config(show_config, NULL); + ret = show_config(set); if (ret < 0) { const char * config_filename = config_exclusive_filename; if (!config_exclusive_filename) @@ -83,5 +106,7 @@ int cmd_config(int argc, const char **argv, const char *prefix __maybe_unused) usage_with_options(config_usage, config_options); } + perf_config_set__delete(set); +out_err: return ret; } diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index 8053a8ceefda..9ce354f469dc 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -428,7 +428,7 @@ static void hists__baseline_only(struct hists *hists) struct rb_root *root; struct rb_node *next; - if (sort__need_collapse) + if (hists__has(hists, need_collapse)) root = &hists->entries_collapsed; else root = hists->entries_in; @@ -450,7 +450,7 @@ static void hists__precompute(struct hists *hists) struct rb_root *root; struct rb_node *next; - if (sort__need_collapse) + if (hists__has(hists, need_collapse)) root = &hists->entries_collapsed; else root = hists->entries_in; diff --git a/tools/perf/builtin-help.c b/tools/perf/builtin-help.c index bc1de9b8fd67..f9830c902b78 100644 --- a/tools/perf/builtin-help.c +++ b/tools/perf/builtin-help.c @@ -61,6 +61,7 @@ static int check_emacsclient_version(void) struct child_process ec_process; const char *argv_ec[] = { "emacsclient", "--version", NULL }; int version; + int ret = -1; /* emacsclient prints its version number on stderr */ memset(&ec_process, 0, sizeof(ec_process)); @@ -71,7 +72,10 @@ static int check_emacsclient_version(void) fprintf(stderr, "Failed to start emacsclient.\n"); return -1; } - strbuf_read(&buffer, ec_process.err, 20); + if (strbuf_read(&buffer, ec_process.err, 20) < 0) { + fprintf(stderr, "Failed to read emacsclient version\n"); + goto out; + } close(ec_process.err); /* @@ -82,8 +86,7 @@ static int check_emacsclient_version(void) if (prefixcmp(buffer.buf, "emacsclient")) { fprintf(stderr, "Failed to parse emacsclient version.\n"); - strbuf_release(&buffer); - return -1; + goto out; } version = atoi(buffer.buf + strlen("emacsclient")); @@ -92,12 +95,11 @@ static int check_emacsclient_version(void) fprintf(stderr, "emacsclient version '%d' too old (< 22).\n", version); - strbuf_release(&buffer); - return -1; - } - + } else + ret = 0; +out: strbuf_release(&buffer); - return 0; + return ret; } static void exec_woman_emacs(const char *path, const char *page) diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index d1a2d104f2bc..e5afa8fe1bf1 100644 --- 
a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -748,6 +748,7 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused) .auxtrace_info = perf_event__repipe_op2_synth, .auxtrace = perf_event__repipe_auxtrace, .auxtrace_error = perf_event__repipe_op2_synth, + .time_conv = perf_event__repipe_op2_synth, .finished_round = perf_event__repipe_oe_synth, .build_id = perf_event__repipe_op2_synth, .id_index = perf_event__repipe_op2_synth, diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index c9cb3be47cff..58adfee230de 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -375,7 +375,7 @@ static u64 find_callsite(struct perf_evsel *evsel, struct perf_sample *sample) } al.thread = machine__findnew_thread(machine, sample->pid, sample->tid); - sample__resolve_callchain(sample, NULL, evsel, &al, 16); + sample__resolve_callchain(sample, &callchain_cursor, NULL, evsel, &al, 16); callchain_cursor_commit(&callchain_cursor); while (true) { diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index bff666458b28..6487c06d2708 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -982,7 +982,7 @@ static int kvm_live_open_events(struct perf_kvm_stat *kvm) struct perf_evlist *evlist = kvm->evlist; char sbuf[STRERR_BUFSIZE]; - perf_evlist__config(evlist, &kvm->opts); + perf_evlist__config(evlist, &kvm->opts, NULL); /* * Note: exclude_{guest,host} do not apply here. diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index 85db3be4b3cb..1dc140c5481d 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -62,19 +62,22 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem) int rec_argc, i = 0, j; const char **rec_argv; int ret; + bool all_user = false, all_kernel = false; struct option options[] = { OPT_CALLBACK('e', "event", &mem, "event", "event selector. 
use 'perf mem record -e list' to list available events", parse_record_events), OPT_INCR('v', "verbose", &verbose, "be more verbose (show counter open errors, etc)"), + OPT_BOOLEAN('U', "--all-user", &all_user, "collect only user level data"), + OPT_BOOLEAN('K', "--all-kernel", &all_kernel, "collect only kernel level data"), OPT_END() }; argc = parse_options(argc, argv, options, record_mem_usage, PARSE_OPT_STOP_AT_NON_OPTION); - rec_argc = argc + 7; /* max number of arguments */ + rec_argc = argc + 9; /* max number of arguments */ rec_argv = calloc(rec_argc + 1, sizeof(char *)); if (!rec_argv) return -1; @@ -103,6 +106,12 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem) rec_argv[i++] = perf_mem_events__name(j); }; + if (all_user) + rec_argv[i++] = "--all-user"; + + if (all_kernel) + rec_argv[i++] = "--all-kernel"; + for (j = 0; j < argc; j++, i++) rec_argv[i] = argv[j]; diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 515510ecc76a..f3679c44d3f3 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -29,10 +29,12 @@ #include "util/data.h" #include "util/perf_regs.h" #include "util/auxtrace.h" +#include "util/tsc.h" #include "util/parse-branch-options.h" #include "util/parse-regs-options.h" #include "util/llvm-utils.h" #include "util/bpf-loader.h" +#include "util/trigger.h" #include "asm/bug.h" #include <unistd.h> @@ -55,6 +57,8 @@ struct record { bool no_buildid_cache; bool no_buildid_cache_set; bool buildid_all; + bool timestamp_filename; + bool switch_output; unsigned long long samples; }; @@ -124,9 +128,10 @@ out: static volatile int done; static volatile int signr = -1; static volatile int child_finished; -static volatile int auxtrace_snapshot_enabled; -static volatile int auxtrace_snapshot_err; + static volatile int auxtrace_record__snapshot_started; +static DEFINE_TRIGGER(auxtrace_snapshot_trigger); +static DEFINE_TRIGGER(switch_output_trigger); static void sig_handler(int sig) { @@ -244,11 +249,12 @@ static void record__read_auxtrace_snapshot(struct record *rec) { pr_debug("Recording AUX area tracing snapshot\n"); if (record__auxtrace_read_snapshot_all(rec) < 0) { - auxtrace_snapshot_err = -1; + trigger_error(&auxtrace_snapshot_trigger); } else { - auxtrace_snapshot_err = auxtrace_record__snapshot_finish(rec->itr); - if (!auxtrace_snapshot_err) - auxtrace_snapshot_enabled = 1; + if (auxtrace_record__snapshot_finish(rec->itr)) + trigger_error(&auxtrace_snapshot_trigger); + else + trigger_ready(&auxtrace_snapshot_trigger); } } @@ -283,7 +289,7 @@ static int record__open(struct record *rec) struct record_opts *opts = &rec->opts; int rc = 0; - perf_evlist__config(evlist, opts); + perf_evlist__config(evlist, opts, &callchain_param); evlist__for_each(evlist, pos) { try_again: @@ -494,6 +500,73 @@ record__finish_output(struct record *rec) return; } +static int record__synthesize_workload(struct record *rec) +{ + struct { + struct thread_map map; + struct thread_map_data map_data; + } thread_map; + + thread_map.map.nr = 1; + thread_map.map.map[0].pid = rec->evlist->workload.pid; + thread_map.map.map[0].comm = NULL; + return perf_event__synthesize_thread_map(&rec->tool, &thread_map.map, + process_synthesized_event, + &rec->session->machines.host, + rec->opts.sample_address, + rec->opts.proc_map_timeout); +} + +static int record__synthesize(struct record *rec); + +static int +record__switch_output(struct record *rec, bool at_exit) +{ + struct perf_data_file *file = &rec->file; + int fd, err; + + /* Same Size: 
"2015122520103046"*/ + char timestamp[] = "InvalidTimestamp"; + + rec->samples = 0; + record__finish_output(rec); + err = fetch_current_timestamp(timestamp, sizeof(timestamp)); + if (err) { + pr_err("Failed to get current timestamp\n"); + return -EINVAL; + } + + fd = perf_data_file__switch(file, timestamp, + rec->session->header.data_offset, + at_exit); + if (fd >= 0 && !at_exit) { + rec->bytes_written = 0; + rec->session->header.data_size = 0; + } + + if (!quiet) + fprintf(stderr, "[ perf record: Dump %s.%s ]\n", + file->path, timestamp); + + /* Output tracking events */ + if (!at_exit) { + record__synthesize(rec); + + /* + * In 'perf record --switch-output' without -a, + * record__synthesize() in record__switch_output() won't + * generate tracking events because there's no thread_map + * in evlist. Which causes newly created perf.data doesn't + * contain map and comm information. + * Create a fake thread_map and directly call + * perf_event__synthesize_thread_map() for those events. + */ + if (target__none(&rec->opts.target)) + record__synthesize_workload(rec); + } + return fd; +} + static volatile int workload_exec_errno; /* @@ -512,6 +585,15 @@ static void workload_exec_failed_signal(int signo __maybe_unused, static void snapshot_sig_handler(int sig); +int __weak +perf_event__synth_time_conv(const struct perf_event_mmap_page *pc __maybe_unused, + struct perf_tool *tool __maybe_unused, + perf_event__handler_t process __maybe_unused, + struct machine *machine __maybe_unused) +{ + return 0; +} + static int record__synthesize(struct record *rec) { struct perf_session *session = rec->session; @@ -549,6 +631,11 @@ static int record__synthesize(struct record *rec) } } + err = perf_event__synth_time_conv(rec->evlist->mmap[0].base, tool, + process_synthesized_event, machine); + if (err) + goto out; + if (rec->opts.full_auxtrace) { err = perf_event__synthesize_auxtrace_info(rec->itr, tool, session, process_synthesized_event); @@ -600,10 +687,16 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) signal(SIGCHLD, sig_handler); signal(SIGINT, sig_handler); signal(SIGTERM, sig_handler); - if (rec->opts.auxtrace_snapshot_mode) + + if (rec->opts.auxtrace_snapshot_mode || rec->switch_output) { signal(SIGUSR2, snapshot_sig_handler); - else + if (rec->opts.auxtrace_snapshot_mode) + trigger_on(&auxtrace_snapshot_trigger); + if (rec->switch_output) + trigger_on(&switch_output_trigger); + } else { signal(SIGUSR2, SIG_IGN); + } session = perf_session__new(file, false, tool); if (session == NULL) { @@ -729,27 +822,45 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) perf_evlist__enable(rec->evlist); } - auxtrace_snapshot_enabled = 1; + trigger_ready(&auxtrace_snapshot_trigger); + trigger_ready(&switch_output_trigger); for (;;) { unsigned long long hits = rec->samples; if (record__mmap_read_all(rec) < 0) { - auxtrace_snapshot_enabled = 0; + trigger_error(&auxtrace_snapshot_trigger); + trigger_error(&switch_output_trigger); err = -1; goto out_child; } if (auxtrace_record__snapshot_started) { auxtrace_record__snapshot_started = 0; - if (!auxtrace_snapshot_err) + if (!trigger_is_error(&auxtrace_snapshot_trigger)) record__read_auxtrace_snapshot(rec); - if (auxtrace_snapshot_err) { + if (trigger_is_error(&auxtrace_snapshot_trigger)) { pr_err("AUX area tracing snapshot failed\n"); err = -1; goto out_child; } } + if (trigger_is_hit(&switch_output_trigger)) { + trigger_ready(&switch_output_trigger); + + if (!quiet) + fprintf(stderr, "[ perf record: dump data: Woken up 
%ld times ]\n", + waking); + waking = 0; + fd = record__switch_output(rec, false); + if (fd < 0) { + pr_err("Failed to switch to new file\n"); + trigger_error(&switch_output_trigger); + err = fd; + goto out_child; + } + } + if (hits == rec->samples) { if (done || draining) break; @@ -772,12 +883,13 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) * disable events in this case. */ if (done && !disabled && !target__none(&opts->target)) { - auxtrace_snapshot_enabled = 0; + trigger_off(&auxtrace_snapshot_trigger); perf_evlist__disable(rec->evlist); disabled = true; } } - auxtrace_snapshot_enabled = 0; + trigger_off(&auxtrace_snapshot_trigger); + trigger_off(&switch_output_trigger); if (forks && workload_exec_errno) { char msg[STRERR_BUFSIZE]; @@ -811,11 +923,22 @@ out_child: /* this will be recalculated during process_buildids() */ rec->samples = 0; - if (!err) - record__finish_output(rec); + if (!err) { + if (!rec->timestamp_filename) { + record__finish_output(rec); + } else { + fd = record__switch_output(rec, true); + if (fd < 0) { + status = fd; + goto out_delete_session; + } + } + } if (!err && !quiet) { char samples[128]; + const char *postfix = rec->timestamp_filename ? + ".<timestamp>" : ""; if (rec->samples && !rec->opts.full_auxtrace) scnprintf(samples, sizeof(samples), @@ -823,9 +946,9 @@ out_child: else samples[0] = '\0'; - fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s ]\n", + fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s ]\n", perf_data_file__size(file) / 1024.0 / 1024.0, - file->path, samples); + file->path, postfix, samples); } out_delete_session: @@ -833,58 +956,61 @@ out_delete_session: return status; } -static void callchain_debug(void) +static void callchain_debug(struct callchain_param *callchain) { static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" }; - pr_debug("callchain: type %s\n", str[callchain_param.record_mode]); + pr_debug("callchain: type %s\n", str[callchain->record_mode]); - if (callchain_param.record_mode == CALLCHAIN_DWARF) + if (callchain->record_mode == CALLCHAIN_DWARF) pr_debug("callchain: stack dump size %d\n", - callchain_param.dump_size); + callchain->dump_size); } -int record_parse_callchain_opt(const struct option *opt, - const char *arg, - int unset) +int record_opts__parse_callchain(struct record_opts *record, + struct callchain_param *callchain, + const char *arg, bool unset) { int ret; - struct record_opts *record = (struct record_opts *)opt->value; - - record->callgraph_set = true; - callchain_param.enabled = !unset; + callchain->enabled = !unset; /* --no-call-graph */ if (unset) { - callchain_param.record_mode = CALLCHAIN_NONE; + callchain->record_mode = CALLCHAIN_NONE; pr_debug("callchain: disabled\n"); return 0; } - ret = parse_callchain_record_opt(arg, &callchain_param); + ret = parse_callchain_record_opt(arg, callchain); if (!ret) { /* Enable data address sampling for DWARF unwind. 
*/ - if (callchain_param.record_mode == CALLCHAIN_DWARF) + if (callchain->record_mode == CALLCHAIN_DWARF) record->sample_address = true; - callchain_debug(); + callchain_debug(callchain); } return ret; } +int record_parse_callchain_opt(const struct option *opt, + const char *arg, + int unset) +{ + return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset); +} + int record_callchain_opt(const struct option *opt, const char *arg __maybe_unused, int unset __maybe_unused) { - struct record_opts *record = (struct record_opts *)opt->value; + struct callchain_param *callchain = opt->value; - record->callgraph_set = true; - callchain_param.enabled = true; + callchain->enabled = true; - if (callchain_param.record_mode == CALLCHAIN_NONE) - callchain_param.record_mode = CALLCHAIN_FP; + if (callchain->record_mode == CALLCHAIN_NONE) + callchain->record_mode = CALLCHAIN_FP; - callchain_debug(); + callchain_debug(callchain); return 0; } @@ -1122,7 +1248,7 @@ struct option __record_options[] = { record__parse_mmap_pages), OPT_BOOLEAN(0, "group", &record.opts.group, "put the counters into a counter group"), - OPT_CALLBACK_NOOPT('g', NULL, &record.opts, + OPT_CALLBACK_NOOPT('g', NULL, &callchain_param, NULL, "enables call-graph recording" , &record_callchain_opt), OPT_CALLBACK(0, "call-graph", &record.opts, @@ -1195,6 +1321,10 @@ struct option __record_options[] = { "file", "vmlinux pathname"), OPT_BOOLEAN(0, "buildid-all", &record.buildid_all, "Record build-id of all DSOs regardless of hits"), + OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename, + "append timestamp to output filename"), + OPT_BOOLEAN(0, "switch-output", &record.switch_output, + "Switch output when receive SIGUSR2"), OPT_END() }; @@ -1250,6 +1380,9 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused) return -EINVAL; } + if (rec->switch_output) + rec->timestamp_filename = true; + if (!rec->itr) { rec->itr = auxtrace_record__init(rec->evlist, &err); if (err) @@ -1261,6 +1394,14 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused) if (err) return err; + err = bpf__setup_stdout(rec->evlist); + if (err) { + bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf)); + pr_err("ERROR: Setup BPF stdout failed: %s\n", + errbuf); + return err; + } + err = -ENOMEM; symbol__init(NULL); @@ -1275,8 +1416,36 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused) "If some relocation was applied (e.g. kexec) symbols may be misresolved\n" "even with a suitable vmlinux or kallsyms file.\n\n"); - if (rec->no_buildid_cache || rec->no_buildid) + if (rec->no_buildid_cache || rec->no_buildid) { disable_buildid_cache(); + } else if (rec->switch_output) { + /* + * In 'perf record --switch-output', disable buildid + * generation by default to reduce data file switching + * overhead. 
Still generate buildid if they are required + * explicitly using + * + * perf record --signal-trigger --no-no-buildid \ + * --no-no-buildid-cache + * + * Following code equals to: + * + * if ((rec->no_buildid || !rec->no_buildid_set) && + * (rec->no_buildid_cache || !rec->no_buildid_cache_set)) + * disable_buildid_cache(); + */ + bool disable = true; + + if (rec->no_buildid_set && !rec->no_buildid) + disable = false; + if (rec->no_buildid_cache_set && !rec->no_buildid_cache) + disable = false; + if (disable) { + rec->no_buildid = true; + rec->no_buildid_cache = true; + disable_buildid_cache(); + } + } if (rec->evlist->nr_entries == 0 && perf_evlist__add_default(rec->evlist) < 0) { @@ -1335,9 +1504,13 @@ out_symbol_exit: static void snapshot_sig_handler(int sig __maybe_unused) { - if (!auxtrace_snapshot_enabled) - return; - auxtrace_snapshot_enabled = 0; - auxtrace_snapshot_err = auxtrace_record__snapshot_start(record.itr); - auxtrace_record__snapshot_started = 1; + if (trigger_is_ready(&auxtrace_snapshot_trigger)) { + trigger_hit(&auxtrace_snapshot_trigger); + auxtrace_record__snapshot_started = 1; + if (auxtrace_record__snapshot_start(record.itr)) + trigger_error(&auxtrace_snapshot_trigger); + } + + if (trigger_is_ready(&switch_output_trigger)) + trigger_hit(&switch_output_trigger); } diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 160ea23b45aa..87d40e3c4078 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -47,7 +47,6 @@ struct report { struct perf_tool tool; struct perf_session *session; bool use_tui, use_gtk, use_stdio; - bool dont_use_callchains; bool show_full_info; bool show_threads; bool inverted_callchain; @@ -235,7 +234,7 @@ static int report__setup_sample_type(struct report *rep) sample_type |= PERF_SAMPLE_BRANCH_STACK; if (!is_pipe && !(sample_type & PERF_SAMPLE_CALLCHAIN)) { - if (sort__has_parent) { + if (perf_hpp_list.parent) { ui__error("Selected --sort parent, but no " "callchain data. 
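Both the auxtrace snapshot and the new switch-output path are driven from the SIGUSR2 handler, which only flips trigger state; the heavy lifting stays in the main mmap-read loop. A minimal sketch of that pattern, with a plain flag standing in for switch_output_trigger so the handler stays async-signal-safe:

#include <signal.h>
#include <string.h>

static volatile sig_atomic_t switch_requested;

static void sigusr2_handler(int sig)
{
	(void)sig;
	switch_requested = 1;	/* consumed by the main mmap-read loop */
}

static void install_switch_signal(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = sigusr2_handler;
	sigemptyset(&sa.sa_mask);
	sigaction(SIGUSR2, &sa, NULL);
}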
Did you call " "'perf record' without -g?\n"); @@ -247,7 +246,7 @@ static int report__setup_sample_type(struct report *rep) "you call 'perf record' without -g?\n"); return -1; } - } else if (!rep->dont_use_callchains && + } else if (!callchain_param.enabled && callchain_param.mode != CHAIN_NONE && !symbol_conf.use_callchain) { symbol_conf.use_callchain = true; @@ -599,13 +598,15 @@ static int __cmd_report(struct report *rep) static int report_parse_callchain_opt(const struct option *opt, const char *arg, int unset) { - struct report *rep = (struct report *)opt->value; + struct callchain_param *callchain = opt->value; + callchain->enabled = !unset; /* * --no-call-graph */ if (unset) { - rep->dont_use_callchains = true; + symbol_conf.use_callchain = false; + callchain->mode = CHAIN_NONE; return 0; } @@ -690,7 +691,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused) .ordered_events = true, .ordering_requires_timestamps = true, }, - .max_stack = PERF_MAX_STACK_DEPTH, + .max_stack = sysctl_perf_event_max_stack, .pretty_printing_style = "normal", .socket_filter = -1, }; @@ -734,7 +735,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused) "regex filter to identify parent, see: '--sort parent'"), OPT_BOOLEAN('x', "exclude-other", &symbol_conf.exclude_other, "Only display entries with parent-match"), - OPT_CALLBACK_DEFAULT('g', "call-graph", &report, + OPT_CALLBACK_DEFAULT('g', "call-graph", &callchain_param, "print_type,threshold[,print_limit],order,sort_key[,branch],value", report_callchain_help, &report_parse_callchain_opt, callchain_default_opt), @@ -743,7 +744,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused) OPT_INTEGER(0, "max-stack", &report.max_stack, "Set the maximum stack depth when parsing the callchain, " "anything beyond the specified depth will be ignored. " - "Default: " __stringify(PERF_MAX_STACK_DEPTH)), + "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)), OPT_BOOLEAN('G', "inverted", &report.inverted_callchain, "alias for inverted call graph"), OPT_CALLBACK(0, "ignore-callees", NULL, "regex", @@ -935,7 +936,7 @@ repeat: goto error; } - sort__need_collapse = true; + perf_hpp_list.need_collapse = true; } /* Force tty output for header output and per-thread stat. 
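The --max-stack default now follows the kernel.perf_event_max_stack sysctl instead of the compile-time PERF_MAX_STACK_DEPTH; perf caches the value in sysctl_perf_event_max_stack. A standalone sketch that reads the same knob straight from procfs and falls back to the old constant:

#include <stdio.h>

#define PERF_MAX_STACK_DEPTH 127

static int read_max_stack_default(void)
{
	int max_stack = PERF_MAX_STACK_DEPTH;
	FILE *fp = fopen("/proc/sys/kernel/perf_event_max_stack", "r");

	if (fp) {
		if (fscanf(fp, "%d", &max_stack) != 1)
			max_stack = PERF_MAX_STACK_DEPTH;
		fclose(fp);
	}
	return max_stack;
}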
*/ diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 871b55ae22a4..afa057666c2a 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -11,6 +11,8 @@ #include "util/session.h" #include "util/tool.h" #include "util/cloexec.h" +#include "util/thread_map.h" +#include "util/color.h" #include <subcmd/parse-options.h> #include "util/trace-event.h" @@ -122,6 +124,21 @@ struct trace_sched_handler { struct machine *machine); }; +#define COLOR_PIDS PERF_COLOR_BLUE +#define COLOR_CPUS PERF_COLOR_BG_RED + +struct perf_sched_map { + DECLARE_BITMAP(comp_cpus_mask, MAX_CPUS); + int *comp_cpus; + bool comp; + struct thread_map *color_pids; + const char *color_pids_str; + struct cpu_map *color_cpus; + const char *color_cpus_str; + struct cpu_map *cpus; + const char *cpus_str; +}; + struct perf_sched { struct perf_tool tool; const char *sort_order; @@ -173,6 +190,7 @@ struct perf_sched { struct list_head sort_list, cmp_pid; bool force; bool skip_merge; + struct perf_sched_map map; }; static u64 get_nsecs(void) @@ -1339,6 +1357,38 @@ static int process_sched_wakeup_event(struct perf_tool *tool, return 0; } +union map_priv { + void *ptr; + bool color; +}; + +static bool thread__has_color(struct thread *thread) +{ + union map_priv priv = { + .ptr = thread__priv(thread), + }; + + return priv.color; +} + +static struct thread* +map__findnew_thread(struct perf_sched *sched, struct machine *machine, pid_t pid, pid_t tid) +{ + struct thread *thread = machine__findnew_thread(machine, pid, tid); + union map_priv priv = { + .color = false, + }; + + if (!sched->map.color_pids || !thread || thread__priv(thread)) + return thread; + + if (thread_map__has(sched->map.color_pids, tid)) + priv.color = true; + + thread__set_priv(thread, priv.ptr); + return thread; +} + static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel, struct perf_sample *sample, struct machine *machine) { @@ -1347,13 +1397,25 @@ static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel, int new_shortname; u64 timestamp0, timestamp = sample->time; s64 delta; - int cpu, this_cpu = sample->cpu; + int i, this_cpu = sample->cpu; + int cpus_nr; + bool new_cpu = false; + const char *color = PERF_COLOR_NORMAL; BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0); if (this_cpu > sched->max_cpu) sched->max_cpu = this_cpu; + if (sched->map.comp) { + cpus_nr = bitmap_weight(sched->map.comp_cpus_mask, MAX_CPUS); + if (!test_and_set_bit(this_cpu, sched->map.comp_cpus_mask)) { + sched->map.comp_cpus[cpus_nr++] = this_cpu; + new_cpu = true; + } + } else + cpus_nr = sched->max_cpu; + timestamp0 = sched->cpu_last_switched[this_cpu]; sched->cpu_last_switched[this_cpu] = timestamp; if (timestamp0) @@ -1366,7 +1428,7 @@ static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel, return -1; } - sched_in = machine__findnew_thread(machine, -1, next_pid); + sched_in = map__findnew_thread(sched, machine, -1, next_pid); if (sched_in == NULL) return -1; @@ -1400,26 +1462,52 @@ static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel, new_shortname = 1; } - for (cpu = 0; cpu <= sched->max_cpu; cpu++) { + for (i = 0; i < cpus_nr; i++) { + int cpu = sched->map.comp ? 
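The compact map mode above assigns a column to a CPU only when it is first seen: a bitmap records which CPUs already have a column, and an append-only array remembers the column order. Stripped-down sketch of that bookkeeping, with MAX_CPUS and the bit handling simplified from the perf/kernel helpers:

#include <stdbool.h>

#define MAX_CPUS 4096

struct comp_cpus {
	unsigned long mask[MAX_CPUS / (8 * sizeof(unsigned long))];
	int cpus[MAX_CPUS];
	int nr;
};

static bool comp_cpus_add(struct comp_cpus *c, int cpu)
{
	unsigned long *word = &c->mask[cpu / (8 * sizeof(unsigned long))];
	unsigned long bit = 1UL << (cpu % (8 * sizeof(unsigned long)));

	if (*word & bit)
		return false;		/* CPU already has a column */

	*word |= bit;
	c->cpus[c->nr++] = cpu;		/* new CPU: give it the next column */
	return true;
}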
sched->map.comp_cpus[i] : i; + struct thread *curr_thread = sched->curr_thread[cpu]; + const char *pid_color = color; + const char *cpu_color = color; + + if (curr_thread && thread__has_color(curr_thread)) + pid_color = COLOR_PIDS; + + if (sched->map.cpus && !cpu_map__has(sched->map.cpus, cpu)) + continue; + + if (sched->map.color_cpus && cpu_map__has(sched->map.color_cpus, cpu)) + cpu_color = COLOR_CPUS; + if (cpu != this_cpu) - printf(" "); + color_fprintf(stdout, cpu_color, " "); else - printf("*"); + color_fprintf(stdout, cpu_color, "*"); if (sched->curr_thread[cpu]) - printf("%2s ", sched->curr_thread[cpu]->shortname); + color_fprintf(stdout, pid_color, "%2s ", sched->curr_thread[cpu]->shortname); else - printf(" "); + color_fprintf(stdout, color, " "); } - printf(" %12.6f secs ", (double)timestamp/1e9); + if (sched->map.cpus && !cpu_map__has(sched->map.cpus, this_cpu)) + goto out; + + color_fprintf(stdout, color, " %12.6f secs ", (double)timestamp/1e9); if (new_shortname) { - printf("%s => %s:%d\n", + const char *pid_color = color; + + if (thread__has_color(sched_in)) + pid_color = COLOR_PIDS; + + color_fprintf(stdout, pid_color, "%s => %s:%d", sched_in->shortname, thread__comm_str(sched_in), sched_in->tid); - } else { - printf("\n"); } + if (sched->map.comp && new_cpu) + color_fprintf(stdout, color, " (CPU %d)", this_cpu); + +out: + color_fprintf(stdout, color, "\n"); + thread__put(sched_in); return 0; @@ -1675,9 +1763,75 @@ static int perf_sched__lat(struct perf_sched *sched) return 0; } +static int setup_map_cpus(struct perf_sched *sched) +{ + struct cpu_map *map; + + sched->max_cpu = sysconf(_SC_NPROCESSORS_CONF); + + if (sched->map.comp) { + sched->map.comp_cpus = zalloc(sched->max_cpu * sizeof(int)); + if (!sched->map.comp_cpus) + return -1; + } + + if (!sched->map.cpus_str) + return 0; + + map = cpu_map__new(sched->map.cpus_str); + if (!map) { + pr_err("failed to get cpus map from %s\n", sched->map.cpus_str); + return -1; + } + + sched->map.cpus = map; + return 0; +} + +static int setup_color_pids(struct perf_sched *sched) +{ + struct thread_map *map; + + if (!sched->map.color_pids_str) + return 0; + + map = thread_map__new_by_tid_str(sched->map.color_pids_str); + if (!map) { + pr_err("failed to get thread map from %s\n", sched->map.color_pids_str); + return -1; + } + + sched->map.color_pids = map; + return 0; +} + +static int setup_color_cpus(struct perf_sched *sched) +{ + struct cpu_map *map; + + if (!sched->map.color_cpus_str) + return 0; + + map = cpu_map__new(sched->map.color_cpus_str); + if (!map) { + pr_err("failed to get thread map from %s\n", sched->map.color_cpus_str); + return -1; + } + + sched->map.color_cpus = map; + return 0; +} + static int perf_sched__map(struct perf_sched *sched) { - sched->max_cpu = sysconf(_SC_NPROCESSORS_CONF); + if (setup_map_cpus(sched)) + return -1; + + if (setup_color_pids(sched)) + return -1; + + if (setup_color_cpus(sched)) + return -1; setup_pager(); if (perf_sched__read_events(sched)) @@ -1831,6 +1985,17 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused) "dump raw trace in ASCII"), OPT_END() }; + const struct option map_options[] = { + OPT_BOOLEAN(0, "compact", &sched.map.comp, + "map output in compact mode"), + OPT_STRING(0, "color-pids", &sched.map.color_pids_str, "pids", + "highlight given pids in map"), + OPT_STRING(0, "color-cpus", &sched.map.color_cpus_str, "cpus", + "highlight given CPUs in map"), + OPT_STRING(0, "cpus", &sched.map.cpus_str, "cpus", + "display given CPUs in map"), + OPT_END() + 
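thread__has_color()/map__findnew_thread() stash a single bool in the opaque thread priv pointer via union map_priv, so the --color-pids highlighting needs no per-thread allocation. A standalone sketch of that union trick; it relies on the usual union type-punning that C compilers support, and the pointer is zeroed first so no indeterminate bytes leak through:

#include <stdbool.h>
#include <stddef.h>

union map_priv {
	void *ptr;
	bool color;
};

static void *color_to_priv(bool color)
{
	union map_priv priv = { .ptr = NULL };	/* zero the whole pointer */

	priv.color = color;
	return priv.ptr;
}

static bool priv_to_color(void *ptr)
{
	union map_priv priv = { .ptr = ptr };

	return priv.color;
}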
}; const char * const latency_usage[] = { "perf sched latency [<options>]", NULL @@ -1839,6 +2004,10 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused) "perf sched replay [<options>]", NULL }; + const char * const map_usage[] = { + "perf sched map [<options>]", + NULL + }; const char *const sched_subcommands[] = { "record", "latency", "map", "replay", "script", NULL }; const char *sched_usage[] = { @@ -1887,6 +2056,11 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused) setup_sorting(&sched, latency_options, latency_usage); return perf_sched__lat(&sched); } else if (!strcmp(argv[0], "map")) { + if (argc) { + argc = parse_options(argc, argv, map_options, map_usage, 0); + if (argc) + usage_with_options(map_usage, map_options); + } sched.tp_handler = &map_ops; setup_sorting(&sched, latency_options, latency_usage); return perf_sched__map(&sched); diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 3770c3dffe5e..efca81679bb3 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -22,6 +22,7 @@ #include "util/thread_map.h" #include "util/stat.h" #include <linux/bitmap.h> +#include <linux/stringify.h> #include "asm/bug.h" #include "util/mem-events.h" @@ -317,19 +318,19 @@ static void set_print_ip_opts(struct perf_event_attr *attr) output[type].print_ip_opts = 0; if (PRINT_FIELD(IP)) - output[type].print_ip_opts |= PRINT_IP_OPT_IP; + output[type].print_ip_opts |= EVSEL__PRINT_IP; if (PRINT_FIELD(SYM)) - output[type].print_ip_opts |= PRINT_IP_OPT_SYM; + output[type].print_ip_opts |= EVSEL__PRINT_SYM; if (PRINT_FIELD(DSO)) - output[type].print_ip_opts |= PRINT_IP_OPT_DSO; + output[type].print_ip_opts |= EVSEL__PRINT_DSO; if (PRINT_FIELD(SYMOFFSET)) - output[type].print_ip_opts |= PRINT_IP_OPT_SYMOFFSET; + output[type].print_ip_opts |= EVSEL__PRINT_SYMOFFSET; if (PRINT_FIELD(SRCLINE)) - output[type].print_ip_opts |= PRINT_IP_OPT_SRCLINE; + output[type].print_ip_opts |= EVSEL__PRINT_SRCLINE; } /* @@ -569,18 +570,23 @@ static void print_sample_bts(struct perf_sample *sample, /* print branch_from information */ if (PRINT_FIELD(IP)) { unsigned int print_opts = output[attr->type].print_ip_opts; + struct callchain_cursor *cursor = NULL; - if (symbol_conf.use_callchain && sample->callchain) { - printf("\n"); - } else { - printf(" "); - if (print_opts & PRINT_IP_OPT_SRCLINE) { + if (symbol_conf.use_callchain && sample->callchain && + thread__resolve_callchain(al->thread, &callchain_cursor, evsel, + sample, NULL, NULL, scripting_max_stack) == 0) + cursor = &callchain_cursor; + + if (cursor == NULL) { + putchar(' '); + if (print_opts & EVSEL__PRINT_SRCLINE) { print_srcline_last = true; - print_opts &= ~PRINT_IP_OPT_SRCLINE; + print_opts &= ~EVSEL__PRINT_SRCLINE; } - } - perf_evsel__print_ip(evsel, sample, al, print_opts, - scripting_max_stack); + } else + putchar('\n'); + + sample__fprintf_sym(sample, al, 0, print_opts, cursor, stdout); } /* print branch_to information */ @@ -783,14 +789,15 @@ static void process_event(struct perf_script *script, printf("%16" PRIu64, sample->weight); if (PRINT_FIELD(IP)) { - if (!symbol_conf.use_callchain) - printf(" "); - else - printf("\n"); + struct callchain_cursor *cursor = NULL; + + if (symbol_conf.use_callchain && sample->callchain && + thread__resolve_callchain(al->thread, &callchain_cursor, evsel, + sample, NULL, NULL, scripting_max_stack) == 0) + cursor = &callchain_cursor; - perf_evsel__print_ip(evsel, sample, al, - output[attr->type].print_ip_opts, - 
scripting_max_stack); + putchar(cursor ? '\n' : ' '); + sample__fprintf_sym(sample, al, 0, output[attr->type].print_ip_opts, cursor, stdout); } if (PRINT_FIELD(IREGS)) @@ -1415,21 +1422,19 @@ static int is_directory(const char *base_path, const struct dirent *dent) return S_ISDIR(st.st_mode); } -#define for_each_lang(scripts_path, scripts_dir, lang_dirent, lang_next)\ - while (!readdir_r(scripts_dir, &lang_dirent, &lang_next) && \ - lang_next) \ - if ((lang_dirent.d_type == DT_DIR || \ - (lang_dirent.d_type == DT_UNKNOWN && \ - is_directory(scripts_path, &lang_dirent))) && \ - (strcmp(lang_dirent.d_name, ".")) && \ - (strcmp(lang_dirent.d_name, ".."))) +#define for_each_lang(scripts_path, scripts_dir, lang_dirent) \ + while ((lang_dirent = readdir(scripts_dir)) != NULL) \ + if ((lang_dirent->d_type == DT_DIR || \ + (lang_dirent->d_type == DT_UNKNOWN && \ + is_directory(scripts_path, lang_dirent))) && \ + (strcmp(lang_dirent->d_name, ".")) && \ + (strcmp(lang_dirent->d_name, ".."))) -#define for_each_script(lang_path, lang_dir, script_dirent, script_next)\ - while (!readdir_r(lang_dir, &script_dirent, &script_next) && \ - script_next) \ - if (script_dirent.d_type != DT_DIR && \ - (script_dirent.d_type != DT_UNKNOWN || \ - !is_directory(lang_path, &script_dirent))) +#define for_each_script(lang_path, lang_dir, script_dirent) \ + while ((script_dirent = readdir(lang_dir)) != NULL) \ + if (script_dirent->d_type != DT_DIR && \ + (script_dirent->d_type != DT_UNKNOWN || \ + !is_directory(lang_path, script_dirent))) #define RECORD_SUFFIX "-record" @@ -1575,7 +1580,7 @@ static int list_available_scripts(const struct option *opt __maybe_unused, const char *s __maybe_unused, int unset __maybe_unused) { - struct dirent *script_next, *lang_next, script_dirent, lang_dirent; + struct dirent *script_dirent, *lang_dirent; char scripts_path[MAXPATHLEN]; DIR *scripts_dir, *lang_dir; char script_path[MAXPATHLEN]; @@ -1590,19 +1595,19 @@ static int list_available_scripts(const struct option *opt __maybe_unused, if (!scripts_dir) return -1; - for_each_lang(scripts_path, scripts_dir, lang_dirent, lang_next) { + for_each_lang(scripts_path, scripts_dir, lang_dirent) { snprintf(lang_path, MAXPATHLEN, "%s/%s/bin", scripts_path, - lang_dirent.d_name); + lang_dirent->d_name); lang_dir = opendir(lang_path); if (!lang_dir) continue; - for_each_script(lang_path, lang_dir, script_dirent, script_next) { - script_root = get_script_root(&script_dirent, REPORT_SUFFIX); + for_each_script(lang_path, lang_dir, script_dirent) { + script_root = get_script_root(script_dirent, REPORT_SUFFIX); if (script_root) { desc = script_desc__findnew(script_root); snprintf(script_path, MAXPATHLEN, "%s/%s", - lang_path, script_dirent.d_name); + lang_path, script_dirent->d_name); read_script_info(desc, script_path); free(script_root); } @@ -1690,7 +1695,7 @@ static int check_ev_match(char *dir_name, char *scriptname, */ int find_scripts(char **scripts_array, char **scripts_path_array) { - struct dirent *script_next, *lang_next, script_dirent, lang_dirent; + struct dirent *script_dirent, *lang_dirent; char scripts_path[MAXPATHLEN], lang_path[MAXPATHLEN]; DIR *scripts_dir, *lang_dir; struct perf_session *session; @@ -1713,9 +1718,9 @@ int find_scripts(char **scripts_array, char **scripts_path_array) return -1; } - for_each_lang(scripts_path, scripts_dir, lang_dirent, lang_next) { + for_each_lang(scripts_path, scripts_dir, lang_dirent) { snprintf(lang_path, MAXPATHLEN, "%s/%s", scripts_path, - lang_dirent.d_name); + lang_dirent->d_name); #ifdef 
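The for_each_lang()/for_each_script() macros switch from readdir_r(), which glibc has deprecated, to plain readdir(); that is safe here because each DIR stream is walked by a single thread. The same iteration pattern in a minimal standalone form:

#include <dirent.h>
#include <stdio.h>
#include <string.h>

static void list_subdirs(const char *path)
{
	DIR *dir = opendir(path);
	struct dirent *ent;

	if (!dir)
		return;

	while ((ent = readdir(dir)) != NULL) {
		if (!strcmp(ent->d_name, ".") || !strcmp(ent->d_name, ".."))
			continue;
		printf("%s\n", ent->d_name);
	}
	closedir(dir);
}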
NO_LIBPERL if (strstr(lang_path, "perl")) continue; @@ -1729,16 +1734,16 @@ int find_scripts(char **scripts_array, char **scripts_path_array) if (!lang_dir) continue; - for_each_script(lang_path, lang_dir, script_dirent, script_next) { + for_each_script(lang_path, lang_dir, script_dirent) { /* Skip those real time scripts: xxxtop.p[yl] */ - if (strstr(script_dirent.d_name, "top.")) + if (strstr(script_dirent->d_name, "top.")) continue; sprintf(scripts_path_array[i], "%s/%s", lang_path, - script_dirent.d_name); - temp = strchr(script_dirent.d_name, '.'); + script_dirent->d_name); + temp = strchr(script_dirent->d_name, '.'); snprintf(scripts_array[i], - (temp - script_dirent.d_name) + 1, - "%s", script_dirent.d_name); + (temp - script_dirent->d_name) + 1, + "%s", script_dirent->d_name); if (check_ev_match(lang_path, scripts_array[i], session)) @@ -1756,7 +1761,7 @@ int find_scripts(char **scripts_array, char **scripts_path_array) static char *get_script_path(const char *script_root, const char *suffix) { - struct dirent *script_next, *lang_next, script_dirent, lang_dirent; + struct dirent *script_dirent, *lang_dirent; char scripts_path[MAXPATHLEN]; char script_path[MAXPATHLEN]; DIR *scripts_dir, *lang_dir; @@ -1769,21 +1774,21 @@ static char *get_script_path(const char *script_root, const char *suffix) if (!scripts_dir) return NULL; - for_each_lang(scripts_path, scripts_dir, lang_dirent, lang_next) { + for_each_lang(scripts_path, scripts_dir, lang_dirent) { snprintf(lang_path, MAXPATHLEN, "%s/%s/bin", scripts_path, - lang_dirent.d_name); + lang_dirent->d_name); lang_dir = opendir(lang_path); if (!lang_dir) continue; - for_each_script(lang_path, lang_dir, script_dirent, script_next) { - __script_root = get_script_root(&script_dirent, suffix); + for_each_script(lang_path, lang_dir, script_dirent) { + __script_root = get_script_root(script_dirent, suffix); if (__script_root && !strcmp(script_root, __script_root)) { free(__script_root); closedir(lang_dir); closedir(scripts_dir); snprintf(script_path, MAXPATHLEN, "%s/%s", - lang_path, script_dirent.d_name); + lang_path, script_dirent->d_name); return strdup(script_path); } free(__script_root); @@ -1961,6 +1966,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused) .exit = perf_event__process_exit, .fork = perf_event__process_fork, .attr = process_attr, + .event_update = perf_event__process_event_update, .tracing_data = perf_event__process_tracing_data, .build_id = perf_event__process_build_id, .id_index = perf_event__process_id_index, @@ -2022,6 +2028,10 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused) "only consider symbols in these pids"), OPT_STRING(0, "tid", &symbol_conf.tid_list_str, "tid[,tid...]", "only consider symbols in these tids"), + OPT_UINTEGER(0, "max-stack", &scripting_max_stack, + "Set the maximum stack depth when parsing the callchain, " + "anything beyond the specified depth will be ignored. 
" + "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)), OPT_BOOLEAN('I', "show-info", &show_full_info, "display extended information from perf.data file"), OPT_BOOLEAN('\0', "show-kernel-path", &symbol_conf.show_kernel_path, @@ -2057,6 +2067,8 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused) NULL }; + scripting_max_stack = sysctl_perf_event_max_stack; + setup_scripting(); argc = parse_options_subcommand(argc, argv, options, script_subcommands, script_usage, diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 1f19f2f999c8..e459b685a4e9 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -298,6 +298,14 @@ static int read_counter(struct perf_evsel *counter) return -1; } } + + if (verbose > 1) { + fprintf(stat_config.output, + "%s: %d: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", + perf_evsel__name(counter), + cpu, + count->val, count->ena, count->run); + } } } @@ -528,6 +536,7 @@ static int __run_perf_stat(int argc, const char **argv) perf_evlist__set_leader(evsel_list); evlist__for_each(evsel_list, counter) { +try_again: if (create_perf_stat_counter(counter) < 0) { /* * PPC returns ENXIO for HW counters until 2.6.37 @@ -544,7 +553,11 @@ static int __run_perf_stat(int argc, const char **argv) if ((counter->leader != counter) || !(counter->leader->nr_members > 1)) continue; - } + } else if (perf_evsel__fallback(counter, errno, msg, sizeof(msg))) { + if (verbose) + ui__warning("%s\n", msg); + goto try_again; + } perf_evsel__open_strerror(counter, &target, errno, msg, sizeof(msg)); diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 833214979c4f..1793da585676 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -688,7 +688,7 @@ static int hist_iter__top_callback(struct hist_entry_iter *iter, struct hist_entry *he = iter->he; struct perf_evsel *evsel = iter->evsel; - if (sort__has_sym && single) + if (perf_hpp_list.sym && single) perf_top__record_precise_ip(top, he, evsel->idx, al->addr); hist__account_cycles(iter->sample->branch_stack, al, iter->sample, @@ -886,7 +886,7 @@ static int perf_top__start_counters(struct perf_top *top) struct perf_evlist *evlist = top->evlist; struct record_opts *opts = &top->record_opts; - perf_evlist__config(evlist, opts); + perf_evlist__config(evlist, opts, &callchain_param); evlist__for_each(evlist, counter) { try_again: @@ -917,15 +917,15 @@ out_err: return -1; } -static int perf_top__setup_sample_type(struct perf_top *top __maybe_unused) +static int callchain_param__setup_sample_type(struct callchain_param *callchain) { - if (!sort__has_sym) { - if (symbol_conf.use_callchain) { + if (!perf_hpp_list.sym) { + if (callchain->enabled) { ui__error("Selected -g but \"sym\" not present in --sort/-s."); return -EINVAL; } - } else if (callchain_param.mode != CHAIN_NONE) { - if (callchain_register_param(&callchain_param) < 0) { + } else if (callchain->mode != CHAIN_NONE) { + if (callchain_register_param(callchain) < 0) { ui__error("Can't register callchain params.\n"); return -EINVAL; } @@ -952,7 +952,7 @@ static int __cmd_top(struct perf_top *top) goto out_delete; } - ret = perf_top__setup_sample_type(top); + ret = callchain_param__setup_sample_type(&callchain_param); if (ret) goto out_delete; @@ -962,7 +962,7 @@ static int __cmd_top(struct perf_top *top) machine__synthesize_threads(&top->session->machines.host, &opts->target, top->evlist->threads, false, opts->proc_map_timeout); - if (sort__has_socket) { + if (perf_hpp_list.socket) { 
ret = perf_env__read_cpu_topology_map(&perf_env); if (ret < 0) goto out_err_cpu_topo; @@ -1045,18 +1045,17 @@ callchain_opt(const struct option *opt, const char *arg, int unset) static int parse_callchain_opt(const struct option *opt, const char *arg, int unset) { - struct record_opts *record = (struct record_opts *)opt->value; + struct callchain_param *callchain = opt->value; - record->callgraph_set = true; - callchain_param.enabled = !unset; - callchain_param.record_mode = CALLCHAIN_FP; + callchain->enabled = !unset; + callchain->record_mode = CALLCHAIN_FP; /* * --no-call-graph */ if (unset) { symbol_conf.use_callchain = false; - callchain_param.record_mode = CALLCHAIN_NONE; + callchain->record_mode = CALLCHAIN_NONE; return 0; } @@ -1104,7 +1103,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused) }, .proc_map_timeout = 500, }, - .max_stack = PERF_MAX_STACK_DEPTH, + .max_stack = sysctl_perf_event_max_stack, .sym_pcnt_filter = 5, }; struct record_opts *opts = &top.record_opts; @@ -1162,17 +1161,17 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused) "output field(s): overhead, period, sample plus all of sort keys"), OPT_BOOLEAN('n', "show-nr-samples", &symbol_conf.show_nr_samples, "Show a column with the number of samples"), - OPT_CALLBACK_NOOPT('g', NULL, &top.record_opts, + OPT_CALLBACK_NOOPT('g', NULL, &callchain_param, NULL, "enables call-graph recording and display", &callchain_opt), - OPT_CALLBACK(0, "call-graph", &top.record_opts, + OPT_CALLBACK(0, "call-graph", &callchain_param, "record_mode[,record_size],print_type,threshold[,print_limit],order,sort_key[,branch]", top_callchain_help, &parse_callchain_opt), OPT_BOOLEAN(0, "children", &symbol_conf.cumulate_callchain, "Accumulate callchains of children and show total overhead as well"), OPT_INTEGER(0, "max-stack", &top.max_stack, "Set the maximum stack depth when parsing the callchain. 
" - "Default: " __stringify(PERF_MAX_STACK_DEPTH)), + "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)), OPT_CALLBACK(0, "ignore-callees", NULL, "regex", "ignore callees of these functions in call graphs", report_parse_ignore_callees_opt), @@ -1256,7 +1255,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused) sort__mode = SORT_MODE__TOP; /* display thread wants entries to be collapsed in a different tree */ - sort__need_collapse = 1; + perf_hpp_list.need_collapse = 1; if (top.use_stdio) use_browser = 0; @@ -1312,7 +1311,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused) top.sym_evsel = perf_evlist__first(top.evlist); - if (!symbol_conf.use_callchain) { + if (!callchain_param.enabled) { symbol_conf.cumulate_callchain = false; perf_hpp__cancel_cumulate(); } diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 93ac724fb635..6e5c325148e4 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -34,79 +34,76 @@ #include "trace-event.h" #include "util/parse-events.h" #include "util/bpf-loader.h" +#include "callchain.h" +#include "syscalltbl.h" +#include "rb_resort.h" -#include <libaudit.h> +#include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */ #include <stdlib.h> -#include <sys/mman.h> -#include <linux/futex.h> #include <linux/err.h> - -/* For older distros: */ -#ifndef MAP_STACK -# define MAP_STACK 0x20000 -#endif - -#ifndef MADV_HWPOISON -# define MADV_HWPOISON 100 - -#endif - -#ifndef MADV_MERGEABLE -# define MADV_MERGEABLE 12 -#endif - -#ifndef MADV_UNMERGEABLE -# define MADV_UNMERGEABLE 13 -#endif - -#ifndef EFD_SEMAPHORE -# define EFD_SEMAPHORE 1 -#endif - -#ifndef EFD_NONBLOCK -# define EFD_NONBLOCK 00004000 -#endif - -#ifndef EFD_CLOEXEC -# define EFD_CLOEXEC 02000000 -#endif +#include <linux/filter.h> +#include <linux/audit.h> +#include <sys/ptrace.h> +#include <linux/random.h> +#include <linux/stringify.h> #ifndef O_CLOEXEC # define O_CLOEXEC 02000000 #endif -#ifndef SOCK_DCCP -# define SOCK_DCCP 6 -#endif - -#ifndef SOCK_CLOEXEC -# define SOCK_CLOEXEC 02000000 -#endif - -#ifndef SOCK_NONBLOCK -# define SOCK_NONBLOCK 00004000 -#endif - -#ifndef MSG_CMSG_CLOEXEC -# define MSG_CMSG_CLOEXEC 0x40000000 -#endif - -#ifndef PERF_FLAG_FD_NO_GROUP -# define PERF_FLAG_FD_NO_GROUP (1UL << 0) -#endif - -#ifndef PERF_FLAG_FD_OUTPUT -# define PERF_FLAG_FD_OUTPUT (1UL << 1) -#endif - -#ifndef PERF_FLAG_PID_CGROUP -# define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup id, per-cpu mode only */ -#endif - -#ifndef PERF_FLAG_FD_CLOEXEC -# define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */ -#endif - +struct trace { + struct perf_tool tool; + struct syscalltbl *sctbl; + struct { + int max; + struct syscall *table; + struct { + struct perf_evsel *sys_enter, + *sys_exit; + } events; + } syscalls; + struct record_opts opts; + struct perf_evlist *evlist; + struct machine *host; + struct thread *current; + u64 base_time; + FILE *output; + unsigned long nr_events; + struct strlist *ev_qualifier; + struct { + size_t nr; + int *entries; + } ev_qualifier_ids; + struct intlist *tid_list; + struct intlist *pid_list; + struct { + size_t nr; + pid_t *entries; + } filter_pids; + double duration_filter; + double runtime_ms; + struct { + u64 vfs_getname, + proc_getname; + } stats; + unsigned int max_stack; + unsigned int min_stack; + bool not_ev_qualifier; + bool live; + bool full_time; + bool sched; + bool multiple_threads; + bool summary; + bool summary_only; + bool 
show_comm; + bool show_tool_stats; + bool trace_syscalls; + bool kernel_syscallchains; + bool force; + bool vfs_getname; + int trace_pgfaults; + int open_id; +}; struct tp_field { int offset; @@ -371,221 +368,6 @@ static size_t syscall_arg__scnprintf_int(char *bf, size_t size, #define SCA_INT syscall_arg__scnprintf_int -static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size, - struct syscall_arg *arg) -{ - int printed = 0, prot = arg->val; - - if (prot == PROT_NONE) - return scnprintf(bf, size, "NONE"); -#define P_MMAP_PROT(n) \ - if (prot & PROT_##n) { \ - printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ - prot &= ~PROT_##n; \ - } - - P_MMAP_PROT(EXEC); - P_MMAP_PROT(READ); - P_MMAP_PROT(WRITE); -#ifdef PROT_SEM - P_MMAP_PROT(SEM); -#endif - P_MMAP_PROT(GROWSDOWN); - P_MMAP_PROT(GROWSUP); -#undef P_MMAP_PROT - - if (prot) - printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", prot); - - return printed; -} - -#define SCA_MMAP_PROT syscall_arg__scnprintf_mmap_prot - -static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size, - struct syscall_arg *arg) -{ - int printed = 0, flags = arg->val; - -#define P_MMAP_FLAG(n) \ - if (flags & MAP_##n) { \ - printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ - flags &= ~MAP_##n; \ - } - - P_MMAP_FLAG(SHARED); - P_MMAP_FLAG(PRIVATE); -#ifdef MAP_32BIT - P_MMAP_FLAG(32BIT); -#endif - P_MMAP_FLAG(ANONYMOUS); - P_MMAP_FLAG(DENYWRITE); - P_MMAP_FLAG(EXECUTABLE); - P_MMAP_FLAG(FILE); - P_MMAP_FLAG(FIXED); - P_MMAP_FLAG(GROWSDOWN); -#ifdef MAP_HUGETLB - P_MMAP_FLAG(HUGETLB); -#endif - P_MMAP_FLAG(LOCKED); - P_MMAP_FLAG(NONBLOCK); - P_MMAP_FLAG(NORESERVE); - P_MMAP_FLAG(POPULATE); - P_MMAP_FLAG(STACK); -#ifdef MAP_UNINITIALIZED - P_MMAP_FLAG(UNINITIALIZED); -#endif -#undef P_MMAP_FLAG - - if (flags) - printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); - - return printed; -} - -#define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags - -static size_t syscall_arg__scnprintf_mremap_flags(char *bf, size_t size, - struct syscall_arg *arg) -{ - int printed = 0, flags = arg->val; - -#define P_MREMAP_FLAG(n) \ - if (flags & MREMAP_##n) { \ - printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ - flags &= ~MREMAP_##n; \ - } - - P_MREMAP_FLAG(MAYMOVE); -#ifdef MREMAP_FIXED - P_MREMAP_FLAG(FIXED); -#endif -#undef P_MREMAP_FLAG - - if (flags) - printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? 
"|" : "", flags); - - return printed; -} - -#define SCA_MREMAP_FLAGS syscall_arg__scnprintf_mremap_flags - -static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size, - struct syscall_arg *arg) -{ - int behavior = arg->val; - - switch (behavior) { -#define P_MADV_BHV(n) case MADV_##n: return scnprintf(bf, size, #n) - P_MADV_BHV(NORMAL); - P_MADV_BHV(RANDOM); - P_MADV_BHV(SEQUENTIAL); - P_MADV_BHV(WILLNEED); - P_MADV_BHV(DONTNEED); - P_MADV_BHV(REMOVE); - P_MADV_BHV(DONTFORK); - P_MADV_BHV(DOFORK); - P_MADV_BHV(HWPOISON); -#ifdef MADV_SOFT_OFFLINE - P_MADV_BHV(SOFT_OFFLINE); -#endif - P_MADV_BHV(MERGEABLE); - P_MADV_BHV(UNMERGEABLE); -#ifdef MADV_HUGEPAGE - P_MADV_BHV(HUGEPAGE); -#endif -#ifdef MADV_NOHUGEPAGE - P_MADV_BHV(NOHUGEPAGE); -#endif -#ifdef MADV_DONTDUMP - P_MADV_BHV(DONTDUMP); -#endif -#ifdef MADV_DODUMP - P_MADV_BHV(DODUMP); -#endif -#undef P_MADV_PHV - default: break; - } - - return scnprintf(bf, size, "%#x", behavior); -} - -#define SCA_MADV_BHV syscall_arg__scnprintf_madvise_behavior - -static size_t syscall_arg__scnprintf_flock(char *bf, size_t size, - struct syscall_arg *arg) -{ - int printed = 0, op = arg->val; - - if (op == 0) - return scnprintf(bf, size, "NONE"); -#define P_CMD(cmd) \ - if ((op & LOCK_##cmd) == LOCK_##cmd) { \ - printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #cmd); \ - op &= ~LOCK_##cmd; \ - } - - P_CMD(SH); - P_CMD(EX); - P_CMD(NB); - P_CMD(UN); - P_CMD(MAND); - P_CMD(RW); - P_CMD(READ); - P_CMD(WRITE); -#undef P_OP - - if (op) - printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", op); - - return printed; -} - -#define SCA_FLOCK syscall_arg__scnprintf_flock - -static size_t syscall_arg__scnprintf_futex_op(char *bf, size_t size, struct syscall_arg *arg) -{ - enum syscall_futex_args { - SCF_UADDR = (1 << 0), - SCF_OP = (1 << 1), - SCF_VAL = (1 << 2), - SCF_TIMEOUT = (1 << 3), - SCF_UADDR2 = (1 << 4), - SCF_VAL3 = (1 << 5), - }; - int op = arg->val; - int cmd = op & FUTEX_CMD_MASK; - size_t printed = 0; - - switch (cmd) { -#define P_FUTEX_OP(n) case FUTEX_##n: printed = scnprintf(bf, size, #n); - P_FUTEX_OP(WAIT); arg->mask |= SCF_VAL3|SCF_UADDR2; break; - P_FUTEX_OP(WAKE); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break; - P_FUTEX_OP(FD); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break; - P_FUTEX_OP(REQUEUE); arg->mask |= SCF_VAL3|SCF_TIMEOUT; break; - P_FUTEX_OP(CMP_REQUEUE); arg->mask |= SCF_TIMEOUT; break; - P_FUTEX_OP(CMP_REQUEUE_PI); arg->mask |= SCF_TIMEOUT; break; - P_FUTEX_OP(WAKE_OP); break; - P_FUTEX_OP(LOCK_PI); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break; - P_FUTEX_OP(UNLOCK_PI); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break; - P_FUTEX_OP(TRYLOCK_PI); arg->mask |= SCF_VAL3|SCF_UADDR2; break; - P_FUTEX_OP(WAIT_BITSET); arg->mask |= SCF_UADDR2; break; - P_FUTEX_OP(WAKE_BITSET); arg->mask |= SCF_UADDR2; break; - P_FUTEX_OP(WAIT_REQUEUE_PI); break; - default: printed = scnprintf(bf, size, "%#x", cmd); break; - } - - if (op & FUTEX_PRIVATE_FLAG) - printed += scnprintf(bf + printed, size - printed, "|PRIV"); - - if (op & FUTEX_CLOCK_REALTIME) - printed += scnprintf(bf + printed, size - printed, "|CLKRT"); - - return printed; -} - -#define SCA_FUTEX_OP syscall_arg__scnprintf_futex_op - static const char *bpf_cmd[] = { "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM", "MAP_GET_NEXT_KEY", "PROG_LOAD", @@ -652,110 +434,6 @@ static const char *socket_families[] = { }; static DEFINE_STRARRAY(socket_families); -#ifndef SOCK_TYPE_MASK 
-#define SOCK_TYPE_MASK 0xf -#endif - -static size_t syscall_arg__scnprintf_socket_type(char *bf, size_t size, - struct syscall_arg *arg) -{ - size_t printed; - int type = arg->val, - flags = type & ~SOCK_TYPE_MASK; - - type &= SOCK_TYPE_MASK; - /* - * Can't use a strarray, MIPS may override for ABI reasons. - */ - switch (type) { -#define P_SK_TYPE(n) case SOCK_##n: printed = scnprintf(bf, size, #n); break; - P_SK_TYPE(STREAM); - P_SK_TYPE(DGRAM); - P_SK_TYPE(RAW); - P_SK_TYPE(RDM); - P_SK_TYPE(SEQPACKET); - P_SK_TYPE(DCCP); - P_SK_TYPE(PACKET); -#undef P_SK_TYPE - default: - printed = scnprintf(bf, size, "%#x", type); - } - -#define P_SK_FLAG(n) \ - if (flags & SOCK_##n) { \ - printed += scnprintf(bf + printed, size - printed, "|%s", #n); \ - flags &= ~SOCK_##n; \ - } - - P_SK_FLAG(CLOEXEC); - P_SK_FLAG(NONBLOCK); -#undef P_SK_FLAG - - if (flags) - printed += scnprintf(bf + printed, size - printed, "|%#x", flags); - - return printed; -} - -#define SCA_SK_TYPE syscall_arg__scnprintf_socket_type - -#ifndef MSG_PROBE -#define MSG_PROBE 0x10 -#endif -#ifndef MSG_WAITFORONE -#define MSG_WAITFORONE 0x10000 -#endif -#ifndef MSG_SENDPAGE_NOTLAST -#define MSG_SENDPAGE_NOTLAST 0x20000 -#endif -#ifndef MSG_FASTOPEN -#define MSG_FASTOPEN 0x20000000 -#endif - -static size_t syscall_arg__scnprintf_msg_flags(char *bf, size_t size, - struct syscall_arg *arg) -{ - int printed = 0, flags = arg->val; - - if (flags == 0) - return scnprintf(bf, size, "NONE"); -#define P_MSG_FLAG(n) \ - if (flags & MSG_##n) { \ - printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ - flags &= ~MSG_##n; \ - } - - P_MSG_FLAG(OOB); - P_MSG_FLAG(PEEK); - P_MSG_FLAG(DONTROUTE); - P_MSG_FLAG(TRYHARD); - P_MSG_FLAG(CTRUNC); - P_MSG_FLAG(PROBE); - P_MSG_FLAG(TRUNC); - P_MSG_FLAG(DONTWAIT); - P_MSG_FLAG(EOR); - P_MSG_FLAG(WAITALL); - P_MSG_FLAG(FIN); - P_MSG_FLAG(SYN); - P_MSG_FLAG(CONFIRM); - P_MSG_FLAG(RST); - P_MSG_FLAG(ERRQUEUE); - P_MSG_FLAG(NOSIGNAL); - P_MSG_FLAG(MORE); - P_MSG_FLAG(WAITFORONE); - P_MSG_FLAG(SENDPAGE_NOTLAST); - P_MSG_FLAG(FASTOPEN); - P_MSG_FLAG(CMSG_CLOEXEC); -#undef P_MSG_FLAG - - if (flags) - printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); - - return printed; -} - -#define SCA_MSG_FLAGS syscall_arg__scnprintf_msg_flags - static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size, struct syscall_arg *arg) { @@ -788,116 +466,6 @@ static size_t syscall_arg__scnprintf_filename(char *bf, size_t size, #define SCA_FILENAME syscall_arg__scnprintf_filename -static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size, - struct syscall_arg *arg) -{ - int printed = 0, flags = arg->val; - - if (!(flags & O_CREAT)) - arg->mask |= 1 << (arg->idx + 1); /* Mask the mode parm */ - - if (flags == 0) - return scnprintf(bf, size, "RDONLY"); -#define P_FLAG(n) \ - if (flags & O_##n) { \ - printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ - flags &= ~O_##n; \ - } - - P_FLAG(APPEND); - P_FLAG(ASYNC); - P_FLAG(CLOEXEC); - P_FLAG(CREAT); - P_FLAG(DIRECT); - P_FLAG(DIRECTORY); - P_FLAG(EXCL); - P_FLAG(LARGEFILE); - P_FLAG(NOATIME); - P_FLAG(NOCTTY); -#ifdef O_NONBLOCK - P_FLAG(NONBLOCK); -#elif O_NDELAY - P_FLAG(NDELAY); -#endif -#ifdef O_PATH - P_FLAG(PATH); -#endif - P_FLAG(RDWR); -#ifdef O_DSYNC - if ((flags & O_SYNC) == O_SYNC) - printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? 
"|" : "", "SYNC"); - else { - P_FLAG(DSYNC); - } -#else - P_FLAG(SYNC); -#endif - P_FLAG(TRUNC); - P_FLAG(WRONLY); -#undef P_FLAG - - if (flags) - printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); - - return printed; -} - -#define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags - -static size_t syscall_arg__scnprintf_perf_flags(char *bf, size_t size, - struct syscall_arg *arg) -{ - int printed = 0, flags = arg->val; - - if (flags == 0) - return 0; - -#define P_FLAG(n) \ - if (flags & PERF_FLAG_##n) { \ - printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ - flags &= ~PERF_FLAG_##n; \ - } - - P_FLAG(FD_NO_GROUP); - P_FLAG(FD_OUTPUT); - P_FLAG(PID_CGROUP); - P_FLAG(FD_CLOEXEC); -#undef P_FLAG - - if (flags) - printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); - - return printed; -} - -#define SCA_PERF_FLAGS syscall_arg__scnprintf_perf_flags - -static size_t syscall_arg__scnprintf_eventfd_flags(char *bf, size_t size, - struct syscall_arg *arg) -{ - int printed = 0, flags = arg->val; - - if (flags == 0) - return scnprintf(bf, size, "NONE"); -#define P_FLAG(n) \ - if (flags & EFD_##n) { \ - printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ - flags &= ~EFD_##n; \ - } - - P_FLAG(SEMAPHORE); - P_FLAG(CLOEXEC); - P_FLAG(NONBLOCK); -#undef P_FLAG - - if (flags) - printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); - - return printed; -} - -#define SCA_EFD_FLAGS syscall_arg__scnprintf_eventfd_flags - static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size, struct syscall_arg *arg) { @@ -921,59 +489,6 @@ static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size, #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags -static size_t syscall_arg__scnprintf_signum(char *bf, size_t size, struct syscall_arg *arg) -{ - int sig = arg->val; - - switch (sig) { -#define P_SIGNUM(n) case SIG##n: return scnprintf(bf, size, #n) - P_SIGNUM(HUP); - P_SIGNUM(INT); - P_SIGNUM(QUIT); - P_SIGNUM(ILL); - P_SIGNUM(TRAP); - P_SIGNUM(ABRT); - P_SIGNUM(BUS); - P_SIGNUM(FPE); - P_SIGNUM(KILL); - P_SIGNUM(USR1); - P_SIGNUM(SEGV); - P_SIGNUM(USR2); - P_SIGNUM(PIPE); - P_SIGNUM(ALRM); - P_SIGNUM(TERM); - P_SIGNUM(CHLD); - P_SIGNUM(CONT); - P_SIGNUM(STOP); - P_SIGNUM(TSTP); - P_SIGNUM(TTIN); - P_SIGNUM(TTOU); - P_SIGNUM(URG); - P_SIGNUM(XCPU); - P_SIGNUM(XFSZ); - P_SIGNUM(VTALRM); - P_SIGNUM(PROF); - P_SIGNUM(WINCH); - P_SIGNUM(IO); - P_SIGNUM(PWR); - P_SIGNUM(SYS); -#ifdef SIGEMT - P_SIGNUM(EMT); -#endif -#ifdef SIGSTKFLT - P_SIGNUM(STKFLT); -#endif -#ifdef SIGSWI - P_SIGNUM(SWI); -#endif - default: break; - } - - return scnprintf(bf, size, "%#x", sig); -} - -#define SCA_SIGNUM syscall_arg__scnprintf_signum - #if defined(__i386__) || defined(__x86_64__) /* * FIXME: Make this available to all arches. @@ -1001,16 +516,62 @@ static const char *tioctls[] = { static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401); #endif /* defined(__i386__) || defined(__x86_64__) */ +#ifndef GRND_NONBLOCK +#define GRND_NONBLOCK 0x0001 +#endif +#ifndef GRND_RANDOM +#define GRND_RANDOM 0x0002 +#endif + +static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size, + struct syscall_arg *arg) +{ + int printed = 0, flags = arg->val; + +#define P_FLAG(n) \ + if (flags & GRND_##n) { \ + printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? 
"|" : "", #n); \ + flags &= ~GRND_##n; \ + } + + P_FLAG(RANDOM); + P_FLAG(NONBLOCK); +#undef P_FLAG + + if (flags) + printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); + + return printed; +} + +#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags + #define STRARRAY(arg, name, array) \ .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \ .arg_parm = { [arg] = &strarray__##array, } +#include "trace/beauty/eventfd.c" +#include "trace/beauty/flock.c" +#include "trace/beauty/futex_op.c" +#include "trace/beauty/mmap.c" +#include "trace/beauty/mode_t.c" +#include "trace/beauty/msg_flags.c" +#include "trace/beauty/open_flags.c" +#include "trace/beauty/perf_event_open.c" +#include "trace/beauty/pid.c" +#include "trace/beauty/sched_policy.c" +#include "trace/beauty/seccomp.c" +#include "trace/beauty/signum.c" +#include "trace/beauty/socket_type.c" +#include "trace/beauty/waitid_options.c" + static struct syscall_fmt { const char *name; const char *alias; size_t (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg); void *arg_parm[6]; bool errmsg; + bool errpid; bool timeout; bool hexret; } syscall_fmts[] = { @@ -1028,6 +589,7 @@ static struct syscall_fmt { { .name = "chroot", .errmsg = true, .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, }, { .name = "clock_gettime", .errmsg = true, STRARRAY(0, clk_id, clockid), }, + { .name = "clone", .errpid = true, }, { .name = "close", .errmsg = true, .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, }, { .name = "connect", .errmsg = true, }, @@ -1093,6 +655,11 @@ static struct syscall_fmt { { .name = "getdents64", .errmsg = true, .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, { .name = "getitimer", .errmsg = true, STRARRAY(0, which, itimers), }, + { .name = "getpid", .errpid = true, }, + { .name = "getpgid", .errpid = true, }, + { .name = "getppid", .errpid = true, }, + { .name = "getrandom", .errmsg = true, + .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, }, { .name = "getrlimit", .errmsg = true, STRARRAY(0, resource, rlimit_resources), }, { .name = "getxattr", .errmsg = true, .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, }, @@ -1186,8 +753,7 @@ static struct syscall_fmt { [1] = SCA_FILENAME, /* filename */ [2] = SCA_OPEN_FLAGS, /* flags */ }, }, { .name = "perf_event_open", .errmsg = true, - .arg_scnprintf = { [1] = SCA_INT, /* pid */ - [2] = SCA_INT, /* cpu */ + .arg_scnprintf = { [2] = SCA_INT, /* cpu */ [3] = SCA_FD, /* group_fd */ [4] = SCA_PERF_FLAGS, /* flags */ }, }, { .name = "pipe2", .errmsg = true, @@ -1234,6 +800,11 @@ static struct syscall_fmt { .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, }, { .name = "rt_tgsigqueueinfo", .errmsg = true, .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, }, + { .name = "sched_setscheduler", .errmsg = true, + .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, }, + { .name = "seccomp", .errmsg = true, + .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */ + [1] = SCA_SECCOMP_FLAGS, /* flags */ }, }, { .name = "select", .errmsg = true, .timeout = true, }, { .name = "sendmmsg", .errmsg = true, .arg_scnprintf = { [0] = SCA_FD, /* fd */ @@ -1244,7 +815,9 @@ static struct syscall_fmt { { .name = "sendto", .errmsg = true, .arg_scnprintf = { [0] = SCA_FD, /* fd */ [3] = SCA_MSG_FLAGS, /* flags */ }, }, + { .name = "set_tid_address", .errpid = true, }, { .name = "setitimer", .errmsg = true, STRARRAY(0, which, itimers), }, + { .name = "setpgid", .errmsg = true, }, { .name = "setrlimit", .errmsg = true, STRARRAY(0, 
resource, rlimit_resources), }, { .name = "setxattr", .errmsg = true, .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, }, @@ -1287,6 +860,10 @@ static struct syscall_fmt { .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, }, { .name = "vmsplice", .errmsg = true, .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, + { .name = "wait4", .errpid = true, + .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, }, + { .name = "waitid", .errpid = true, + .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, }, { .name = "write", .errmsg = true, .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, { .name = "writev", .errmsg = true, @@ -1398,59 +975,6 @@ fail: static const size_t trace__entry_str_size = 2048; -struct trace { - struct perf_tool tool; - struct { - int machine; - int open_id; - } audit; - struct { - int max; - struct syscall *table; - struct { - struct perf_evsel *sys_enter, - *sys_exit; - } events; - } syscalls; - struct record_opts opts; - struct perf_evlist *evlist; - struct machine *host; - struct thread *current; - u64 base_time; - FILE *output; - unsigned long nr_events; - struct strlist *ev_qualifier; - struct { - size_t nr; - int *entries; - } ev_qualifier_ids; - struct intlist *tid_list; - struct intlist *pid_list; - struct { - size_t nr; - pid_t *entries; - } filter_pids; - double duration_filter; - double runtime_ms; - struct { - u64 vfs_getname, - proc_getname; - } stats; - bool not_ev_qualifier; - bool live; - bool full_time; - bool sched; - bool multiple_threads; - bool summary; - bool summary_only; - bool show_comm; - bool show_tool_stats; - bool trace_syscalls; - bool force; - bool vfs_getname; - int trace_pgfaults; -}; - static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname) { struct thread_trace *ttrace = thread__priv(thread); @@ -1618,6 +1142,7 @@ static int trace__process_event(struct trace *trace, struct machine *machine, color_fprintf(trace->output, PERF_COLOR_RED, "LOST %" PRIu64 " events!\n", event->lost.lost); ret = machine__process_lost_event(machine, event, sample); + break; default: ret = machine__process_event(machine, event, sample); break; @@ -1675,6 +1200,10 @@ static int syscall__set_arg_fmts(struct syscall *sc) sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx]; else if (field->flags & FIELD_IS_POINTER) sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex; + else if (strcmp(field->type, "pid_t") == 0) + sc->arg_scnprintf[idx] = SCA_PID; + else if (strcmp(field->type, "umode_t") == 0) + sc->arg_scnprintf[idx] = SCA_MODE_T; ++idx; } @@ -1685,7 +1214,7 @@ static int trace__read_syscall_info(struct trace *trace, int id) { char tp_name[128]; struct syscall *sc; - const char *name = audit_syscall_to_name(id, trace->audit.machine); + const char *name = syscalltbl__name(trace->sctbl, id); if (name == NULL) return -1; @@ -1760,7 +1289,7 @@ static int trace__validate_ev_qualifier(struct trace *trace) strlist__for_each(pos, trace->ev_qualifier) { const char *sc = pos->s; - int id = audit_name_to_syscall(sc, trace->audit.machine); + int id = syscalltbl__id(trace->sctbl, sc); if (id < 0) { if (err == 0) { @@ -1846,7 +1375,12 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size, "%ld", val); } } - } else { + } else if (IS_ERR(sc->tp_format)) { + /* + * If we managed to read the tracepoint /format file, then we + * may end up not having any args, like with gettid(), so only + * print the raw args when we didn't manage to read it. 
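syscalltbl__name()/syscalltbl__id() replace the libaudit lookups; conceptually they are lookups over a per-arch id/name table generated at build time. A tiny illustrative table with a by-name binary search (the ids shown are x86_64 values, the real table is much larger):

#include <stdlib.h>
#include <string.h>

struct syscall_ent { int id; const char *name; };

static const struct syscall_ent table[] = {	/* sorted by name */
	{ 257, "openat" },
	{ 0,   "read"   },
	{ 1,   "write"  },
};

static int entcmp(const void *key, const void *ent)
{
	return strcmp(key, ((const struct syscall_ent *)ent)->name);
}

static int syscall_id(const char *name)
{
	const struct syscall_ent *e =
		bsearch(name, table, sizeof(table) / sizeof(table[0]),
			sizeof(table[0]), entcmp);

	return e ? e->id : -1;	/* -1: unknown syscall name */
}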
+ */ int i = 0; while (i < 6) { @@ -1987,7 +1521,7 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel, goto out_put; } - if (!trace->summary_only) + if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) trace__printf_interrupted_entry(trace, sample); ttrace->entry_time = sample->time; @@ -1998,7 +1532,7 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel, args, trace, thread); if (sc->is_exit) { - if (!trace->duration_filter && !trace->summary_only) { + if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) { trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output); fprintf(trace->output, "%-70s\n", ttrace->entry_str); } @@ -2018,6 +1552,29 @@ out_put: return err; } +static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel, + struct perf_sample *sample, + struct callchain_cursor *cursor) +{ + struct addr_location al; + + if (machine__resolve(trace->host, &al, sample) < 0 || + thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack)) + return -1; + + return 0; +} + +static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample) +{ + /* TODO: user-configurable print_opts */ + const unsigned int print_opts = EVSEL__PRINT_SYM | + EVSEL__PRINT_DSO | + EVSEL__PRINT_UNKNOWN_AS_ADDR; + + return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output); +} + static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel, union perf_event *event __maybe_unused, struct perf_sample *sample) @@ -2025,7 +1582,7 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel, long ret; u64 duration = 0; struct thread *thread; - int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1; + int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0; struct syscall *sc = trace__syscall_info(trace, evsel, id); struct thread_trace *ttrace; @@ -2042,7 +1599,7 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel, ret = perf_evsel__sc_tp_uint(evsel, ret, sample); - if (id == trace->audit.open_id && ret >= 0 && ttrace->filename.pending_open) { + if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) { trace__set_fd_pathname(thread, ret, ttrace->filename.name); ttrace->filename.pending_open = false; ++trace->stats.vfs_getname; @@ -2057,6 +1614,15 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel, } else if (trace->duration_filter) goto out; + if (sample->callchain) { + callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor); + if (callchain_ret == 0) { + if (callchain_cursor.nr < trace->min_stack) + goto out; + callchain_ret = 1; + } + } + if (trace->summary_only) goto out; @@ -2073,7 +1639,7 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel, if (sc->fmt == NULL) { signed_print: fprintf(trace->output, ") = %ld", ret); - } else if (ret < 0 && sc->fmt->errmsg) { + } else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) { char bf[STRERR_BUFSIZE]; const char *emsg = strerror_r(-ret, bf, sizeof(bf)), *e = audit_errno_to_name(-ret); @@ -2083,10 +1649,24 @@ signed_print: fprintf(trace->output, ") = 0 Timeout"); else if (sc->fmt->hexret) fprintf(trace->output, ") = %#lx", ret); - else + else if (sc->fmt->errpid) { + struct thread *child = machine__find_thread(trace->host, ret, ret); + + if (child != NULL) { + fprintf(trace->output, ") = %ld", ret); + 
if (child->comm_set) + fprintf(trace->output, " (%s)", thread__comm_str(child)); + thread__put(child); + } + } else goto signed_print; fputc('\n', trace->output); + + if (callchain_ret > 0) + trace__fprintf_callchain(trace, sample); + else if (callchain_ret < 0) + pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel)); out: ttrace->entry_pending = false; err = 0; @@ -2217,6 +1797,17 @@ static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel, union perf_event *event __maybe_unused, struct perf_sample *sample) { + int callchain_ret = 0; + + if (sample->callchain) { + callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor); + if (callchain_ret == 0) { + if (callchain_cursor.nr < trace->min_stack) + goto out; + callchain_ret = 1; + } + } + trace__printf_interrupted_entry(trace, sample); trace__fprintf_tstamp(trace, sample->time, trace->output); @@ -2234,6 +1825,12 @@ static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel, } fprintf(trace->output, ")\n"); + + if (callchain_ret > 0) + trace__fprintf_callchain(trace, sample); + else if (callchain_ret < 0) + pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel)); +out: return 0; } @@ -2264,8 +1861,19 @@ static int trace__pgfault(struct trace *trace, char map_type = 'd'; struct thread_trace *ttrace; int err = -1; + int callchain_ret = 0; thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); + + if (sample->callchain) { + callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor); + if (callchain_ret == 0) { + if (callchain_cursor.nr < trace->min_stack) + goto out_put; + callchain_ret = 1; + } + } + ttrace = thread__trace(thread, trace->output); if (ttrace == NULL) goto out_put; @@ -2307,6 +1915,11 @@ static int trace__pgfault(struct trace *trace, print_location(trace->output, sample, &al, true, false); fprintf(trace->output, " (%c%c)\n", map_type, al.level); + + if (callchain_ret > 0) + trace__fprintf_callchain(trace, sample); + else if (callchain_ret < 0) + pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel)); out: err = 0; out_put: @@ -2326,6 +1939,23 @@ static bool skip_sample(struct trace *trace, struct perf_sample *sample) return false; } +static void trace__set_base_time(struct trace *trace, + struct perf_evsel *evsel, + struct perf_sample *sample) +{ + /* + * BPF events were not setting PERF_SAMPLE_TIME, so be more robust + * and don't use sample->time unconditionally, we may end up having + * some other event in the future without PERF_SAMPLE_TIME for good + * reason, i.e. we may not be interested in its timestamps, just in + * it taking place, picking some piece of information when it + * appears in our event stream (vfs_getname comes to mind). 
+ */ + if (trace->base_time == 0 && !trace->full_time && + (evsel->attr.sample_type & PERF_SAMPLE_TIME)) + trace->base_time = sample->time; +} + static int trace__process_sample(struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, @@ -2340,8 +1970,7 @@ static int trace__process_sample(struct perf_tool *tool, if (skip_sample(trace, sample)) return 0; - if (!trace->full_time && trace->base_time == 0) - trace->base_time = sample->time; + trace__set_base_time(trace, evsel, sample); if (handler) { ++trace->nr_events; @@ -2450,8 +2079,7 @@ static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist) return true; } -static int perf_evlist__add_pgfault(struct perf_evlist *evlist, - u64 config) +static struct perf_evsel *perf_evsel__new_pgfault(u64 config) { struct perf_evsel *evsel; struct perf_event_attr attr = { @@ -2465,13 +2093,10 @@ static int perf_evlist__add_pgfault(struct perf_evlist *evlist, event_attr_init(&attr); evsel = perf_evsel__new(&attr); - if (!evsel) - return -ENOMEM; - - evsel->handler = trace__pgfault; - perf_evlist__add(evlist, evsel); + if (evsel) + evsel->handler = trace__pgfault; - return 0; + return evsel; } static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample) @@ -2479,9 +2104,6 @@ static void trace__handle_event(struct trace *trace, union perf_event *event, st const u32 type = event->header.type; struct perf_evsel *evsel; - if (!trace->full_time && trace->base_time == 0) - trace->base_time = sample->time; - if (type != PERF_RECORD_SAMPLE) { trace__process_event(trace, trace->host, event, sample); return; @@ -2493,6 +2115,8 @@ static void trace__handle_event(struct trace *trace, union perf_event *event, st return; } + trace__set_base_time(trace, evsel, sample); + if (evsel->attr.type == PERF_TYPE_TRACEPOINT && sample->raw_data == NULL) { fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n", @@ -2527,6 +2151,15 @@ static int trace__add_syscall_newtp(struct trace *trace) perf_evlist__add(evlist, sys_enter); perf_evlist__add(evlist, sys_exit); + if (callchain_param.enabled && !trace->kernel_syscallchains) { + /* + * We're interested only in the user space callchain + * leading to the syscall, allow overriding that for + * debugging reasons using --kernel_syscall_callchains + */ + sys_exit->attr.exclude_callchain_kernel = 1; + } + trace->syscalls.events.sys_enter = sys_enter; trace->syscalls.events.sys_exit = sys_exit; @@ -2565,7 +2198,7 @@ out_enomem: static int trace__run(struct trace *trace, int argc, const char **argv) { struct perf_evlist *evlist = trace->evlist; - struct perf_evsel *evsel; + struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL; int err = -1, i; unsigned long before; const bool forks = argc > 0; @@ -2579,14 +2212,19 @@ static int trace__run(struct trace *trace, int argc, const char **argv) if (trace->trace_syscalls) trace->vfs_getname = perf_evlist__add_vfs_getname(evlist); - if ((trace->trace_pgfaults & TRACE_PFMAJ) && - perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MAJ)) { - goto out_error_mem; + if ((trace->trace_pgfaults & TRACE_PFMAJ)) { + pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ); + if (pgfault_maj == NULL) + goto out_error_mem; + perf_evlist__add(evlist, pgfault_maj); } - if ((trace->trace_pgfaults & TRACE_PFMIN) && - perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MIN)) - goto out_error_mem; + if ((trace->trace_pgfaults & TRACE_PFMIN)) { + pgfault_min = 
perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN); + if (pgfault_min == NULL) + goto out_error_mem; + perf_evlist__add(evlist, pgfault_min); + } if (trace->sched && perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime", @@ -2605,7 +2243,45 @@ static int trace__run(struct trace *trace, int argc, const char **argv) goto out_delete_evlist; } - perf_evlist__config(evlist, &trace->opts); + perf_evlist__config(evlist, &trace->opts, NULL); + + if (callchain_param.enabled) { + bool use_identifier = false; + + if (trace->syscalls.events.sys_exit) { + perf_evsel__config_callchain(trace->syscalls.events.sys_exit, + &trace->opts, &callchain_param); + use_identifier = true; + } + + if (pgfault_maj) { + perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param); + use_identifier = true; + } + + if (pgfault_min) { + perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param); + use_identifier = true; + } + + if (use_identifier) { + /* + * Now we have evsels with different sample_ids, use + * PERF_SAMPLE_IDENTIFIER to map from sample to evsel + * from a fixed position in each ring buffer record. + * + * As of this the changeset introducing this comment, this + * isn't strictly needed, as the fields that can come before + * PERF_SAMPLE_ID are all used, but we'll probably disable + * some of those for things like copying the payload of + * pointer syscall arguments, and for vfs_getname we don't + * need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this + * here as a warning we need to use PERF_SAMPLE_IDENTIFIER. + */ + perf_evlist__set_sample_bit(evlist, IDENTIFIER); + perf_evlist__reset_sample_bit(evlist, ID); + } + } signal(SIGCHLD, sig_handler); signal(SIGINT, sig_handler); @@ -2883,15 +2559,29 @@ static size_t trace__fprintf_threads_header(FILE *fp) return printed; } +DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs, + struct stats *stats; + double msecs; + int syscall; +) +{ + struct int_node *source = rb_entry(nd, struct int_node, rb_node); + struct stats *stats = source->priv; + + entry->syscall = source->i; + entry->stats = stats; + entry->msecs = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0; +} + static size_t thread__dump_stats(struct thread_trace *ttrace, struct trace *trace, FILE *fp) { - struct stats *stats; size_t printed = 0; struct syscall *sc; - struct int_node *inode = intlist__first(ttrace->syscall_stats); + struct rb_node *nd; + DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats); - if (inode == NULL) + if (syscall_stats == NULL) return 0; printed += fprintf(fp, "\n"); @@ -2900,9 +2590,8 @@ static size_t thread__dump_stats(struct thread_trace *ttrace, printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n"); printed += fprintf(fp, " --------------- -------- --------- --------- --------- --------- ------\n"); - /* each int_node is a syscall */ - while (inode) { - stats = inode->priv; + resort_rb__for_each(nd, syscall_stats) { + struct stats *stats = syscall_stats_entry->stats; if (stats) { double min = (double)(stats->min) / NSEC_PER_MSEC; double max = (double)(stats->max) / NSEC_PER_MSEC; @@ -2913,34 +2602,23 @@ static size_t thread__dump_stats(struct thread_trace *ttrace, pct = avg ? 
100.0 * stddev_stats(stats)/avg : 0.0; avg /= NSEC_PER_MSEC; - sc = &trace->syscalls.table[inode->i]; + sc = &trace->syscalls.table[syscall_stats_entry->syscall]; printed += fprintf(fp, " %-15s", sc->name); printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f", - n, avg * n, min, avg); + n, syscall_stats_entry->msecs, min, avg); printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct); } - - inode = intlist__next(inode); } + resort_rb__delete(syscall_stats); printed += fprintf(fp, "\n\n"); return printed; } -/* struct used to pass data to per-thread function */ -struct summary_data { - FILE *fp; - struct trace *trace; - size_t printed; -}; - -static int trace__fprintf_one_thread(struct thread *thread, void *priv) +static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace) { - struct summary_data *data = priv; - FILE *fp = data->fp; - size_t printed = data->printed; - struct trace *trace = data->trace; + size_t printed = 0; struct thread_trace *ttrace = thread__priv(thread); double ratio; @@ -2956,25 +2634,45 @@ static int trace__fprintf_one_thread(struct thread *thread, void *priv) printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj); if (ttrace->pfmin) printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin); - printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms); + if (trace->sched) + printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms); + else if (fputc('\n', fp) != EOF) + ++printed; + printed += thread__dump_stats(ttrace, trace, fp); - data->printed += printed; + return printed; +} - return 0; +static unsigned long thread__nr_events(struct thread_trace *ttrace) +{ + return ttrace ? ttrace->nr_events : 0; +} + +DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)), + struct thread *thread; +) +{ + entry->thread = rb_entry(nd, struct thread, rb_node); } static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp) { - struct summary_data data = { - .fp = fp, - .trace = trace - }; - data.printed = trace__fprintf_threads_header(fp); + DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host); + size_t printed = trace__fprintf_threads_header(fp); + struct rb_node *nd; + + if (threads == NULL) { + fprintf(fp, "%s", "Error sorting output by nr_events!\n"); + return 0; + } - machine__for_each_thread(trace->host, trace__fprintf_one_thread, &data); + resort_rb__for_each(nd, threads) + printed += trace__fprintf_thread(fp, threads_entry->thread, trace); - return data.printed; + resort_rb__delete(threads); + + return printed; } static int trace__set_duration(const struct option *opt, const char *str, @@ -3070,10 +2768,6 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) NULL }; struct trace trace = { - .audit = { - .machine = audit_detect_machine(), - .open_id = audit_name_to_syscall("open", trace.audit.machine), - }, .syscalls = { . 
max = -1, }, @@ -3091,6 +2785,8 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) .output = stderr, .show_comm = true, .trace_syscalls = true, + .kernel_syscallchains = false, + .max_stack = UINT_MAX, }; const char *output_name = NULL; const char *ev_qualifier_str = NULL; @@ -3136,10 +2832,24 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) "Trace pagefaults", parse_pagefaults, "maj"), OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"), OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"), + OPT_CALLBACK(0, "call-graph", &trace.opts, + "record_mode[,record_size]", record_callchain_help, + &record_parse_callchain_opt), + OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains, + "Show the kernel callchains on the syscall exit path"), + OPT_UINTEGER(0, "min-stack", &trace.min_stack, + "Set the minimum stack depth when parsing the callchain, " + "anything below the specified depth will be ignored."), + OPT_UINTEGER(0, "max-stack", &trace.max_stack, + "Set the maximum stack depth when parsing the callchain, " + "anything beyond the specified depth will be ignored. " + "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)), OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout, "per thread proc mmap processing timeout in ms"), OPT_END() }; + bool __maybe_unused max_stack_user_set = true; + bool mmap_pages_user_set = true; const char * const trace_subcommands[] = { "record", NULL }; int err; char bf[BUFSIZ]; @@ -3148,8 +2858,9 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) signal(SIGFPE, sighandler_dump_stack); trace.evlist = perf_evlist__new(); + trace.sctbl = syscalltbl__new(); - if (trace.evlist == NULL) { + if (trace.evlist == NULL || trace.sctbl == NULL) { pr_err("Not enough memory to run!\n"); err = -ENOMEM; goto out; @@ -3158,11 +2869,40 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands, trace_usage, PARSE_OPT_STOP_AT_NON_OPTION); + err = bpf__setup_stdout(trace.evlist); + if (err) { + bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf)); + pr_err("ERROR: Setup BPF stdout failed: %s\n", bf); + goto out; + } + + err = -1; + if (trace.trace_pgfaults) { trace.opts.sample_address = true; trace.opts.sample_time = true; } + if (trace.opts.mmap_pages == UINT_MAX) + mmap_pages_user_set = false; + + if (trace.max_stack == UINT_MAX) { + trace.max_stack = sysctl_perf_event_max_stack; + max_stack_user_set = false; + } + +#ifdef HAVE_DWARF_UNWIND_SUPPORT + if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) + record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false); +#endif + + if (callchain_param.enabled) { + if (!mmap_pages_user_set && geteuid() == 0) + trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4; + + symbol_conf.use_callchain = true; + } + if (trace.evlist->nr_entries > 0) evlist__set_evsel_handler(trace.evlist, trace__event_handler); @@ -3179,6 +2919,11 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) return -1; } + if (!trace.trace_syscalls && ev_qualifier_str) { + pr_err("The -e option can't be used with --no-syscalls.\n"); + goto out; + } + if (output_name != NULL) { err = trace__open_output(&trace, output_name); if (err < 0) { @@ -3187,6 +2932,8 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) 
} } + trace.open_id = syscalltbl__id(trace.sctbl, "open"); + if (ev_qualifier_str != NULL) { const char *s = ev_qualifier_str; struct strlist_config slist_config = { diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile index f7d7f5a1cad5..1e46277286c2 100644 --- a/tools/perf/config/Makefile +++ b/tools/perf/config/Makefile @@ -27,7 +27,7 @@ NO_PERF_REGS := 1 ifeq ($(ARCH),x86) $(call detected,CONFIG_X86) ifeq (${IS_64_BIT}, 1) - CFLAGS += -DHAVE_ARCH_X86_64_SUPPORT + CFLAGS += -DHAVE_ARCH_X86_64_SUPPORT -DHAVE_SYSCALL_TABLE -I$(OUTPUT)arch/x86/include/generated ARCH_INCLUDE = ../../arch/x86/lib/memcpy_64.S ../../arch/x86/lib/memset_64.S LIBUNWIND_LIBS = -lunwind -lunwind-x86_64 $(call detected,CONFIG_X86_64) @@ -268,6 +268,12 @@ else ifneq ($(feature-dwarf), 1) msg := $(warning No libdw.h found or old libdw.h found or elfutils is older than 0.138, disables dwarf support. Please install new elfutils-devel/libdw-dev); NO_DWARF := 1 + else + ifneq ($(feature-dwarf_getlocations), 1) + msg := $(warning Old libdw.h, finding variables at given 'perf probe' point will not work, install elfutils-devel/libdw-dev >= 0.157); + else + CFLAGS += -DHAVE_DWARF_GETLOCATIONS + endif # dwarf_getlocations endif # Dwarf support endif # libelf support endif # NO_LIBELF @@ -289,9 +295,6 @@ ifndef NO_LIBELF CFLAGS += -DHAVE_ELF_GETPHDRNUM_SUPPORT endif - # include ARCH specific config - -include $(src-perf)/arch/$(ARCH)/Makefile - ifndef NO_DWARF ifeq ($(origin PERF_HAVE_DWARF_REGS), undefined) msg := $(warning DWARF register mappings have not been defined for architecture $(ARCH), DWARF support disabled); diff --git a/tools/perf/jvmti/jvmti_agent.c b/tools/perf/jvmti/jvmti_agent.c index 6461e02ab940..3573f315f955 100644 --- a/tools/perf/jvmti/jvmti_agent.c +++ b/tools/perf/jvmti/jvmti_agent.c @@ -92,6 +92,22 @@ error: return ret; } +static int use_arch_timestamp; + +static inline uint64_t +get_arch_timestamp(void) +{ +#if defined(__i386__) || defined(__x86_64__) + unsigned int low, high; + + asm volatile("rdtsc" : "=a" (low), "=d" (high)); + + return low | ((uint64_t)high) << 32; +#else + return 0; +#endif +} + #define NSEC_PER_SEC 1000000000 static int perf_clk_id = CLOCK_MONOTONIC; @@ -107,6 +123,9 @@ perf_get_timestamp(void) struct timespec ts; int ret; + if (use_arch_timestamp) + return get_arch_timestamp(); + ret = clock_gettime(perf_clk_id, &ts); if (ret) return 0; @@ -203,6 +222,17 @@ perf_close_marker_file(void) munmap(marker_addr, pgsz); } +static void +init_arch_timestamp(void) +{ + char *str = getenv("JITDUMP_USE_ARCH_TIMESTAMP"); + + if (!str || !*str || !strcmp(str, "0")) + return; + + use_arch_timestamp = 1; +} + void *jvmti_open(void) { int pad_cnt; @@ -211,11 +241,17 @@ void *jvmti_open(void) int fd; FILE *fp; + init_arch_timestamp(); + /* * check if clockid is supported */ - if (!perf_get_timestamp()) - warnx("jvmti: kernel does not support %d clock id", perf_clk_id); + if (!perf_get_timestamp()) { + if (use_arch_timestamp) + warnx("jvmti: arch timestamp not supported"); + else + warnx("jvmti: kernel does not support %d clock id", perf_clk_id); + } memset(&header, 0, sizeof(header)); @@ -263,6 +299,9 @@ void *jvmti_open(void) header.timestamp = perf_get_timestamp(); + if (use_arch_timestamp) + header.flags |= JITDUMP_FLAGS_ARCH_TIMESTAMP; + if (!fwrite(&header, sizeof(header), 1, fp)) { warn("jvmti: cannot write dumpfile header"); goto error; diff --git a/tools/perf/perf.c b/tools/perf/perf.c index aaee0a782747..797000842d40 100644 --- a/tools/perf/perf.c +++ b/tools/perf/perf.c 
@@ -17,6 +17,7 @@ #include <subcmd/parse-options.h> #include "util/bpf-loader.h" #include "util/debug.h" +#include <api/fs/fs.h> #include <api/fs/tracing_path.h> #include <pthread.h> #include <stdlib.h> @@ -308,9 +309,11 @@ static int handle_alias(int *argcp, const char ***argv) if (*argcp > 1) { struct strbuf buf; - strbuf_init(&buf, PATH_MAX); - strbuf_addstr(&buf, alias_string); - sq_quote_argv(&buf, (*argv) + 1, PATH_MAX); + if (strbuf_init(&buf, PATH_MAX) < 0 || + strbuf_addstr(&buf, alias_string) < 0 || + sq_quote_argv(&buf, (*argv) + 1, + PATH_MAX) < 0) + die("Failed to allocate memory."); free(alias_string); alias_string = buf.buf; } @@ -533,6 +536,7 @@ int main(int argc, const char **argv) { const char *cmd; char sbuf[STRERR_BUFSIZE]; + int value; /* libsubcmd init */ exec_cmd_init("perf", PREFIX, PERF_EXEC_PATH, EXEC_PATH_ENVIRONMENT); @@ -542,6 +546,9 @@ int main(int argc, const char **argv) page_size = sysconf(_SC_PAGE_SIZE); cacheline_size = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); + if (sysctl__read_int("kernel/perf_event_max_stack", &value) == 0) + sysctl_perf_event_max_stack = value; + cmd = extract_argv0_path(argv[0]); if (!cmd) cmd = "perf-help"; @@ -549,6 +556,7 @@ int main(int argc, const char **argv) srandom(time(NULL)); perf_config(perf_default_config, NULL); + set_buildid_dir(NULL); /* get debugfs/tracefs mount point from /proc/mounts */ tracing_path_mount(); @@ -572,7 +580,6 @@ int main(int argc, const char **argv) } if (!prefixcmp(cmd, "trace")) { #ifdef HAVE_LIBAUDIT_SUPPORT - set_buildid_dir(NULL); setup_path(); argv[0] = "trace"; return cmd_trace(argc, argv, NULL); @@ -587,7 +594,6 @@ int main(int argc, const char **argv) argc--; handle_options(&argv, &argc, NULL); commit_pager_choice(); - set_buildid_dir(NULL); if (argc > 0) { if (!prefixcmp(argv[0], "--")) diff --git a/tools/perf/perf.h b/tools/perf/perf.h index 5381a01c0610..cd8f1b150f9e 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -52,7 +52,6 @@ struct record_opts { bool sample_weight; bool sample_time; bool sample_time_set; - bool callgraph_set; bool period; bool running_time; bool full_auxtrace; diff --git a/tools/perf/scripts/python/export-to-postgresql.py b/tools/perf/scripts/python/export-to-postgresql.py index 1b02cdc0cab6..7656ff8aa066 100644 --- a/tools/perf/scripts/python/export-to-postgresql.py +++ b/tools/perf/scripts/python/export-to-postgresql.py @@ -34,10 +34,9 @@ import datetime # # ubuntu: # -# $ sudo apt-get install postgresql +# $ sudo apt-get install postgresql python-pyside.qtsql libqt4-sql-psql # $ sudo su - postgres -# $ createuser <your user id here> -# Shall the new role be a superuser? 
(y/n) y +# $ createuser -s <your user id here> # # An example of using this script with Intel PT: # @@ -224,11 +223,14 @@ sys.path.append(os.environ['PERF_EXEC_PATH'] + \ perf_db_export_mode = True perf_db_export_calls = False +perf_db_export_callchains = False + def usage(): - print >> sys.stderr, "Usage is: export-to-postgresql.py <database name> [<columns>] [<calls>]" + print >> sys.stderr, "Usage is: export-to-postgresql.py <database name> [<columns>] [<calls>] [<callchains>]" print >> sys.stderr, "where: columns 'all' or 'branches'" - print >> sys.stderr, " calls 'calls' => create calls table" + print >> sys.stderr, " calls 'calls' => create calls and call_paths table" + print >> sys.stderr, " callchains 'callchains' => create call_paths table" raise Exception("Too few arguments") if (len(sys.argv) < 2): @@ -246,9 +248,11 @@ if columns not in ("all", "branches"): branches = (columns == "branches") -if (len(sys.argv) >= 4): - if (sys.argv[3] == "calls"): +for i in range(3,len(sys.argv)): + if (sys.argv[i] == "calls"): perf_db_export_calls = True + elif (sys.argv[i] == "callchains"): + perf_db_export_callchains = True else: usage() @@ -359,14 +363,16 @@ else: 'transaction bigint,' 'data_src bigint,' 'branch_type integer,' - 'in_tx boolean)') + 'in_tx boolean,' + 'call_path_id bigint)') -if perf_db_export_calls: +if perf_db_export_calls or perf_db_export_callchains: do_query(query, 'CREATE TABLE call_paths (' 'id bigint NOT NULL,' 'parent_id bigint,' 'symbol_id bigint,' 'ip bigint)') +if perf_db_export_calls: do_query(query, 'CREATE TABLE calls (' 'id bigint NOT NULL,' 'thread_id bigint,' @@ -428,7 +434,7 @@ do_query(query, 'CREATE VIEW comm_threads_view AS ' '(SELECT tid FROM threads WHERE id = thread_id) AS tid' ' FROM comm_threads') -if perf_db_export_calls: +if perf_db_export_calls or perf_db_export_callchains: do_query(query, 'CREATE VIEW call_paths_view AS ' 'SELECT ' 'c.id,' @@ -444,6 +450,7 @@ if perf_db_export_calls: '(SELECT dso_id FROM symbols WHERE id = p.symbol_id) AS parent_dso_id,' '(SELECT dso FROM symbols_view WHERE id = p.symbol_id) AS parent_dso_short_name' ' FROM call_paths c INNER JOIN call_paths p ON p.id = c.parent_id') +if perf_db_export_calls: do_query(query, 'CREATE VIEW calls_view AS ' 'SELECT ' 'calls.id,' @@ -541,8 +548,9 @@ dso_file = open_output_file("dso_table.bin") symbol_file = open_output_file("symbol_table.bin") branch_type_file = open_output_file("branch_type_table.bin") sample_file = open_output_file("sample_table.bin") -if perf_db_export_calls: +if perf_db_export_calls or perf_db_export_callchains: call_path_file = open_output_file("call_path_table.bin") +if perf_db_export_calls: call_file = open_output_file("call_table.bin") def trace_begin(): @@ -554,8 +562,8 @@ def trace_begin(): comm_table(0, "unknown") dso_table(0, 0, "unknown", "unknown", "") symbol_table(0, 0, 0, 0, 0, "unknown") - sample_table(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) - if perf_db_export_calls: + sample_table(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) + if perf_db_export_calls or perf_db_export_callchains: call_path_table(0, 0, 0, 0) unhandled_count = 0 @@ -571,8 +579,9 @@ def trace_end(): copy_output_file(symbol_file, "symbols") copy_output_file(branch_type_file, "branch_types") copy_output_file(sample_file, "samples") - if perf_db_export_calls: + if perf_db_export_calls or perf_db_export_callchains: copy_output_file(call_path_file, "call_paths") + if perf_db_export_calls: copy_output_file(call_file, "calls") print 
datetime.datetime.today(), "Removing intermediate files..." @@ -585,8 +594,9 @@ def trace_end(): remove_output_file(symbol_file) remove_output_file(branch_type_file) remove_output_file(sample_file) - if perf_db_export_calls: + if perf_db_export_calls or perf_db_export_callchains: remove_output_file(call_path_file) + if perf_db_export_calls: remove_output_file(call_file) os.rmdir(output_dir_name) print datetime.datetime.today(), "Adding primary keys" @@ -599,8 +609,9 @@ def trace_end(): do_query(query, 'ALTER TABLE symbols ADD PRIMARY KEY (id)') do_query(query, 'ALTER TABLE branch_types ADD PRIMARY KEY (id)') do_query(query, 'ALTER TABLE samples ADD PRIMARY KEY (id)') - if perf_db_export_calls: + if perf_db_export_calls or perf_db_export_callchains: do_query(query, 'ALTER TABLE call_paths ADD PRIMARY KEY (id)') + if perf_db_export_calls: do_query(query, 'ALTER TABLE calls ADD PRIMARY KEY (id)') print datetime.datetime.today(), "Adding foreign keys" @@ -623,10 +634,11 @@ def trace_end(): 'ADD CONSTRAINT symbolfk FOREIGN KEY (symbol_id) REFERENCES symbols (id),' 'ADD CONSTRAINT todsofk FOREIGN KEY (to_dso_id) REFERENCES dsos (id),' 'ADD CONSTRAINT tosymbolfk FOREIGN KEY (to_symbol_id) REFERENCES symbols (id)') - if perf_db_export_calls: + if perf_db_export_calls or perf_db_export_callchains: do_query(query, 'ALTER TABLE call_paths ' 'ADD CONSTRAINT parentfk FOREIGN KEY (parent_id) REFERENCES call_paths (id),' 'ADD CONSTRAINT symbolfk FOREIGN KEY (symbol_id) REFERENCES symbols (id)') + if perf_db_export_calls: do_query(query, 'ALTER TABLE calls ' 'ADD CONSTRAINT threadfk FOREIGN KEY (thread_id) REFERENCES threads (id),' 'ADD CONSTRAINT commfk FOREIGN KEY (comm_id) REFERENCES comms (id),' @@ -694,11 +706,11 @@ def branch_type_table(branch_type, name, *x): value = struct.pack(fmt, 2, 4, branch_type, n, name) branch_type_file.write(value) -def sample_table(sample_id, evsel_id, machine_id, thread_id, comm_id, dso_id, symbol_id, sym_offset, ip, time, cpu, to_dso_id, to_symbol_id, to_sym_offset, to_ip, period, weight, transaction, data_src, branch_type, in_tx, *x): +def sample_table(sample_id, evsel_id, machine_id, thread_id, comm_id, dso_id, symbol_id, sym_offset, ip, time, cpu, to_dso_id, to_symbol_id, to_sym_offset, to_ip, period, weight, transaction, data_src, branch_type, in_tx, call_path_id, *x): if branches: - value = struct.pack("!hiqiqiqiqiqiqiqiqiqiqiiiqiqiqiqiiiB", 17, 8, sample_id, 8, evsel_id, 8, machine_id, 8, thread_id, 8, comm_id, 8, dso_id, 8, symbol_id, 8, sym_offset, 8, ip, 8, time, 4, cpu, 8, to_dso_id, 8, to_symbol_id, 8, to_sym_offset, 8, to_ip, 4, branch_type, 1, in_tx) + value = struct.pack("!hiqiqiqiqiqiqiqiqiqiqiiiqiqiqiqiiiBiq", 18, 8, sample_id, 8, evsel_id, 8, machine_id, 8, thread_id, 8, comm_id, 8, dso_id, 8, symbol_id, 8, sym_offset, 8, ip, 8, time, 4, cpu, 8, to_dso_id, 8, to_symbol_id, 8, to_sym_offset, 8, to_ip, 4, branch_type, 1, in_tx, 8, call_path_id) else: - value = struct.pack("!hiqiqiqiqiqiqiqiqiqiqiiiqiqiqiqiqiqiqiqiiiB", 21, 8, sample_id, 8, evsel_id, 8, machine_id, 8, thread_id, 8, comm_id, 8, dso_id, 8, symbol_id, 8, sym_offset, 8, ip, 8, time, 4, cpu, 8, to_dso_id, 8, to_symbol_id, 8, to_sym_offset, 8, to_ip, 8, period, 8, weight, 8, transaction, 8, data_src, 4, branch_type, 1, in_tx) + value = struct.pack("!hiqiqiqiqiqiqiqiqiqiqiiiqiqiqiqiqiqiqiqiiiBiq", 22, 8, sample_id, 8, evsel_id, 8, machine_id, 8, thread_id, 8, comm_id, 8, dso_id, 8, symbol_id, 8, sym_offset, 8, ip, 8, time, 4, cpu, 8, to_dso_id, 8, to_symbol_id, 8, to_sym_offset, 8, to_ip, 8, 
period, 8, weight, 8, transaction, 8, data_src, 4, branch_type, 1, in_tx, 8, call_path_id) sample_file.write(value) def call_path_table(cp_id, parent_id, symbol_id, ip, *x): diff --git a/tools/perf/tests/Build b/tools/perf/tests/Build index 1ba628ed049a..66a28982547b 100644 --- a/tools/perf/tests/Build +++ b/tools/perf/tests/Build @@ -37,6 +37,8 @@ perf-y += topology.o perf-y += cpumap.o perf-y += stat.o perf-y += event_update.o +perf-y += event-times.o +perf-y += backward-ring-buffer.o $(OUTPUT)tests/llvm-src-base.c: tests/bpf-script-example.c tests/Build $(call rule_mkdir) diff --git a/tools/perf/tests/backward-ring-buffer.c b/tools/perf/tests/backward-ring-buffer.c new file mode 100644 index 000000000000..d9ba991a9a30 --- /dev/null +++ b/tools/perf/tests/backward-ring-buffer.c @@ -0,0 +1,151 @@ +/* + * Test backward bit in event attribute, read ring buffer from end to + * beginning + */ + +#include <perf.h> +#include <evlist.h> +#include <sys/prctl.h> +#include "tests.h" +#include "debug.h" + +#define NR_ITERS 111 + +static void testcase(void) +{ + int i; + + for (i = 0; i < NR_ITERS; i++) { + char proc_name[10]; + + snprintf(proc_name, sizeof(proc_name), "p:%d\n", i); + prctl(PR_SET_NAME, proc_name); + } +} + +static int count_samples(struct perf_evlist *evlist, int *sample_count, + int *comm_count) +{ + int i; + + for (i = 0; i < evlist->nr_mmaps; i++) { + union perf_event *event; + + perf_evlist__mmap_read_catchup(evlist, i); + while ((event = perf_evlist__mmap_read_backward(evlist, i)) != NULL) { + const u32 type = event->header.type; + + switch (type) { + case PERF_RECORD_SAMPLE: + (*sample_count)++; + break; + case PERF_RECORD_COMM: + (*comm_count)++; + break; + default: + pr_err("Unexpected record of type %d\n", type); + return TEST_FAIL; + } + } + } + return TEST_OK; +} + +static int do_test(struct perf_evlist *evlist, int mmap_pages, + int *sample_count, int *comm_count) +{ + int err; + char sbuf[STRERR_BUFSIZE]; + + err = perf_evlist__mmap(evlist, mmap_pages, true); + if (err < 0) { + pr_debug("perf_evlist__mmap: %s\n", + strerror_r(errno, sbuf, sizeof(sbuf))); + return TEST_FAIL; + } + + perf_evlist__enable(evlist); + testcase(); + perf_evlist__disable(evlist); + + err = count_samples(evlist, sample_count, comm_count); + perf_evlist__munmap(evlist); + return err; +} + + +int test__backward_ring_buffer(int subtest __maybe_unused) +{ + int ret = TEST_SKIP, err, sample_count = 0, comm_count = 0; + char pid[16], sbuf[STRERR_BUFSIZE]; + struct perf_evlist *evlist; + struct perf_evsel *evsel __maybe_unused; + struct parse_events_error parse_error; + struct record_opts opts = { + .target = { + .uid = UINT_MAX, + .uses_mmap = true, + }, + .freq = 0, + .mmap_pages = 256, + .default_interval = 1, + }; + + snprintf(pid, sizeof(pid), "%d", getpid()); + pid[sizeof(pid) - 1] = '\0'; + opts.target.tid = opts.target.pid = pid; + + evlist = perf_evlist__new(); + if (!evlist) { + pr_debug("Not enough memory to create evlist\n"); + return TEST_FAIL; + } + + err = perf_evlist__create_maps(evlist, &opts.target); + if (err < 0) { + pr_debug("Not enough memory to create thread/cpu maps\n"); + goto out_delete_evlist; + } + + bzero(&parse_error, sizeof(parse_error)); + err = parse_events(evlist, "syscalls:sys_enter_prctl", &parse_error); + if (err) { + pr_debug("Failed to parse tracepoint event, try using root\n"); + ret = TEST_SKIP; + goto out_delete_evlist; + } + + perf_evlist__config(evlist, &opts, NULL); + + /* Set backward bit, ring buffer should be writing from end */ + evlist__for_each(evlist,
evsel) + evsel->attr.write_backward = 1; + + err = perf_evlist__open(evlist); + if (err < 0) { + pr_debug("perf_evlist__open: %s\n", + strerror_r(errno, sbuf, sizeof(sbuf))); + goto out_delete_evlist; + } + + ret = TEST_FAIL; + err = do_test(evlist, opts.mmap_pages, &sample_count, + &comm_count); + if (err != TEST_OK) + goto out_delete_evlist; + + if ((sample_count != NR_ITERS) || (comm_count != NR_ITERS)) { + pr_err("Unexpected counter: sample_count=%d, comm_count=%d\n", + sample_count, comm_count); + goto out_delete_evlist; + } + + err = do_test(evlist, 1, &sample_count, &comm_count); + if (err != TEST_OK) + goto out_delete_evlist; + + ret = TEST_OK; +out_delete_evlist: + perf_evlist__delete(evlist); + return ret; +} diff --git a/tools/perf/tests/bpf.c b/tools/perf/tests/bpf.c index 199501c71e27..f31eed31c1a9 100644 --- a/tools/perf/tests/bpf.c +++ b/tools/perf/tests/bpf.c @@ -138,7 +138,7 @@ static int do_test(struct bpf_object *obj, int (*func)(void), perf_evlist__splice_list_tail(evlist, &parse_evlist.list); evlist->nr_groups = parse_evlist.nr_groups; - perf_evlist__config(evlist, &opts); + perf_evlist__config(evlist, &opts, NULL); err = perf_evlist__open(evlist); if (err < 0) { diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index f2b1dcac45d3..0e95c20ecf6e 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -204,6 +204,14 @@ static struct test generic_tests[] = { .func = test__event_update, }, { + .desc = "Test events times", + .func = test__event_times, + }, + { + .desc = "Test backward reading from ring buffer", + .func = test__backward_ring_buffer, + }, + { .func = NULL, }, }; diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index abd3f0ec0c0b..68a69a195545 100644 --- a/tools/perf/tests/code-reading.c +++ b/tools/perf/tests/code-reading.c @@ -532,7 +532,7 @@ static int do_test_code_reading(bool try_kcore) goto out_put; } - perf_evlist__config(evlist, &opts); + perf_evlist__config(evlist, &opts, NULL); evsel = perf_evlist__first(evlist); diff --git a/tools/perf/tests/dso-data.c b/tools/perf/tests/dso-data.c index dc673ff7c437..8cf0d9e189a8 100644 --- a/tools/perf/tests/dso-data.c +++ b/tools/perf/tests/dso-data.c @@ -202,7 +202,7 @@ static int dsos__create(int cnt, int size) { int i; - dsos = malloc(sizeof(dsos) * cnt); + dsos = malloc(sizeof(*dsos) * cnt); TEST_ASSERT_VAL("failed to alloc dsos array", dsos); for (i = 0; i < cnt; i++) { diff --git a/tools/perf/tests/event-times.c b/tools/perf/tests/event-times.c new file mode 100644 index 000000000000..95fb744f6628 --- /dev/null +++ b/tools/perf/tests/event-times.c @@ -0,0 +1,236 @@ +#include <linux/compiler.h> +#include <string.h> +#include "tests.h" +#include "evlist.h" +#include "evsel.h" +#include "util.h" +#include "debug.h" +#include "thread_map.h" +#include "target.h" + +static int attach__enable_on_exec(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = perf_evlist__last(evlist); + struct target target = { + .uid = UINT_MAX, + }; + const char *argv[] = { "true", NULL, }; + char sbuf[STRERR_BUFSIZE]; + int err; + + pr_debug("attaching to spawned child, enable on exec\n"); + + err = perf_evlist__create_maps(evlist, &target); + if (err < 0) { + pr_debug("Not enough memory to create thread/cpu maps\n"); + return err; + } + + err = perf_evlist__prepare_workload(evlist, &target, argv, false, NULL); + if (err < 0) { + pr_debug("Couldn't run the workload!\n"); + return err; + } + + evsel->attr.enable_on_exec = 1; + + err = 
perf_evlist__open(evlist); + if (err < 0) { + pr_debug("perf_evlist__open: %s\n", + strerror_r(errno, sbuf, sizeof(sbuf))); + return err; + } + + return perf_evlist__start_workload(evlist) == 1 ? TEST_OK : TEST_FAIL; +} + +static int detach__enable_on_exec(struct perf_evlist *evlist) +{ + waitpid(evlist->workload.pid, NULL, 0); + return 0; +} + +static int attach__current_disabled(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = perf_evlist__last(evlist); + struct thread_map *threads; + int err; + + pr_debug("attaching to current thread as disabled\n"); + + threads = thread_map__new(-1, getpid(), UINT_MAX); + if (threads == NULL) { + pr_debug("thread_map__new\n"); + return -1; + } + + evsel->attr.disabled = 1; + + err = perf_evsel__open_per_thread(evsel, threads); + if (err) { + pr_debug("Failed to open event cpu-clock:u\n"); + return err; + } + + thread_map__put(threads); + return perf_evsel__enable(evsel) == 0 ? TEST_OK : TEST_FAIL; +} + +static int attach__current_enabled(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = perf_evlist__last(evlist); + struct thread_map *threads; + int err; + + pr_debug("attaching to current thread as enabled\n"); + + threads = thread_map__new(-1, getpid(), UINT_MAX); + if (threads == NULL) { + pr_debug("failed to call thread_map__new\n"); + return -1; + } + + err = perf_evsel__open_per_thread(evsel, threads); + + thread_map__put(threads); + return err == 0 ? TEST_OK : TEST_FAIL; +} + +static int detach__disable(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = perf_evlist__last(evlist); + + return perf_evsel__enable(evsel); +} + +static int attach__cpu_disabled(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = perf_evlist__last(evlist); + struct cpu_map *cpus; + int err; + + pr_debug("attaching to CPU 0 as disabled\n"); + + cpus = cpu_map__new("0"); + if (cpus == NULL) { + pr_debug("failed to call cpu_map__new\n"); + return -1; + } + + evsel->attr.disabled = 1; + + err = perf_evsel__open_per_cpu(evsel, cpus); + if (err) { + if (err == -EACCES) + return TEST_SKIP; + + pr_debug("Failed to open event cpu-clock:u\n"); + return err; + } + + cpu_map__put(cpus); + return perf_evsel__enable(evsel); +} + +static int attach__cpu_enabled(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = perf_evlist__last(evlist); + struct cpu_map *cpus; + int err; + + pr_debug("attaching to CPU 0 as enabled\n"); + + cpus = cpu_map__new("0"); + if (cpus == NULL) { + pr_debug("failed to call cpu_map__new\n"); + return -1; + } + + err = perf_evsel__open_per_cpu(evsel, cpus); + if (err == -EACCES) + return TEST_SKIP; + + cpu_map__put(cpus); + return err ?
TEST_FAIL : TEST_OK; +} + +static int test_times(int (attach)(struct perf_evlist *), + int (detach)(struct perf_evlist *)) +{ + struct perf_counts_values count; + struct perf_evlist *evlist = NULL; + struct perf_evsel *evsel; + int err = -1, i; + + evlist = perf_evlist__new(); + if (!evlist) { + pr_debug("failed to create event list\n"); + goto out_err; + } + + err = parse_events(evlist, "cpu-clock:u", NULL); + if (err) { + pr_debug("failed to parse event cpu-clock:u\n"); + goto out_err; + } + + evsel = perf_evlist__last(evlist); + evsel->attr.read_format |= + PERF_FORMAT_TOTAL_TIME_ENABLED | + PERF_FORMAT_TOTAL_TIME_RUNNING; + + err = attach(evlist); + if (err == TEST_SKIP) { + pr_debug(" SKIP : not enough rights\n"); + return err; + } + + TEST_ASSERT_VAL("failed to attach", !err); + + for (i = 0; i < 100000000; i++) { } + + TEST_ASSERT_VAL("failed to detach", !detach(evlist)); + + perf_evsel__read(evsel, 0, 0, &count); + + err = !(count.ena == count.run); + + pr_debug(" %s: ena %" PRIu64", run %" PRIu64"\n", + !err ? "OK " : "FAILED", + count.ena, count.run); + +out_err: + if (evlist) + perf_evlist__delete(evlist); + return !err ? TEST_OK : TEST_FAIL; +} + +/* + * This test creates software event 'cpu-clock' + * attaches it in several ways (explained below) + * and checks that enabled and running times + * match. + */ +int test__event_times(int subtest __maybe_unused) +{ + int err, ret = 0; + +#define _T(attach, detach) \ + err = test_times(attach, detach); \ + if (err && (ret == TEST_OK || ret == TEST_SKIP)) \ + ret = err; + + /* attach on newly spawned process after exec */ + _T(attach__enable_on_exec, detach__enable_on_exec) + /* attach on current process as enabled */ + _T(attach__current_enabled, detach__disable) + /* attach on current process as disabled */ + _T(attach__current_disabled, detach__disable) + /* attach on cpu as disabled */ + _T(attach__cpu_disabled, detach__disable) + /* attach on cpu as enabled */ + _T(attach__cpu_enabled, detach__disable) + +#undef _T + return ret; +} diff --git a/tools/perf/tests/event_update.c b/tools/perf/tests/event_update.c index 012eab5d1df1..63ecf21750eb 100644 --- a/tools/perf/tests/event_update.c +++ b/tools/perf/tests/event_update.c @@ -30,7 +30,7 @@ static int process_event_scale(struct perf_tool *tool __maybe_unused, TEST_ASSERT_VAL("wrong id", ev->id == 123); TEST_ASSERT_VAL("wrong id", ev->type == PERF_EVENT_UPDATE__SCALE); - TEST_ASSERT_VAL("wrong scale", ev_data->scale = 0.123); + TEST_ASSERT_VAL("wrong scale", ev_data->scale == 0.123); return 0; } diff --git a/tools/perf/tests/hists_common.c b/tools/perf/tests/hists_common.c index f55f4bd47932..6b21746d6eec 100644 --- a/tools/perf/tests/hists_common.c +++ b/tools/perf/tests/hists_common.c @@ -161,7 +161,7 @@ void print_hists_in(struct hists *hists) struct rb_root *root; struct rb_node *node; - if (sort__need_collapse) + if (hists__has(hists, need_collapse)) root = &hists->entries_collapsed; else root = hists->entries_in; diff --git a/tools/perf/tests/hists_cumulate.c b/tools/perf/tests/hists_cumulate.c index ed5aa9eaeb6c..a9e3db3afac4 100644 --- a/tools/perf/tests/hists_cumulate.c +++ b/tools/perf/tests/hists_cumulate.c @@ -101,7 +101,7 @@ static int add_hist_entries(struct hists *hists, struct machine *machine) if (machine__resolve(machine, &al, &sample) < 0) goto out; - if (hist_entry_iter__add(&iter, &al, PERF_MAX_STACK_DEPTH, + if (hist_entry_iter__add(&iter, &al, sysctl_perf_event_max_stack, NULL) < 0) { addr_location__put(&al); goto out; @@ -126,7 +126,7 @@ static void 
del_hist_entries(struct hists *hists) struct rb_root *root_out; struct rb_node *node; - if (sort__need_collapse) + if (hists__has(hists, need_collapse)) root_in = &hists->entries_collapsed; else root_in = hists->entries_in; diff --git a/tools/perf/tests/hists_filter.c b/tools/perf/tests/hists_filter.c index b825d24f8186..e846f8c42013 100644 --- a/tools/perf/tests/hists_filter.c +++ b/tools/perf/tests/hists_filter.c @@ -81,7 +81,7 @@ static int add_hist_entries(struct perf_evlist *evlist, al.socket = fake_samples[i].socket; if (hist_entry_iter__add(&iter, &al, - PERF_MAX_STACK_DEPTH, NULL) < 0) { + sysctl_perf_event_max_stack, NULL) < 0) { addr_location__put(&al); goto out; } diff --git a/tools/perf/tests/hists_link.c b/tools/perf/tests/hists_link.c index 358324e47805..acf5a1301c07 100644 --- a/tools/perf/tests/hists_link.c +++ b/tools/perf/tests/hists_link.c @@ -145,7 +145,7 @@ static int __validate_match(struct hists *hists) /* * Only entries from fake_common_samples should have a pair. */ - if (sort__need_collapse) + if (hists__has(hists, need_collapse)) root = &hists->entries_collapsed; else root = hists->entries_in; @@ -197,7 +197,7 @@ static int __validate_link(struct hists *hists, int idx) * and some entries will have no pair. However every entry * in other hists should have (dummy) pair. */ - if (sort__need_collapse) + if (hists__has(hists, need_collapse)) root = &hists->entries_collapsed; else root = hists->entries_in; diff --git a/tools/perf/tests/hists_output.c b/tools/perf/tests/hists_output.c index d3556fbe8c5c..63c5efaba1b5 100644 --- a/tools/perf/tests/hists_output.c +++ b/tools/perf/tests/hists_output.c @@ -67,7 +67,7 @@ static int add_hist_entries(struct hists *hists, struct machine *machine) if (machine__resolve(machine, &al, &sample) < 0) goto out; - if (hist_entry_iter__add(&iter, &al, PERF_MAX_STACK_DEPTH, + if (hist_entry_iter__add(&iter, &al, sysctl_perf_event_max_stack, NULL) < 0) { addr_location__put(&al); goto out; @@ -92,7 +92,7 @@ static void del_hist_entries(struct hists *hists) struct rb_root *root_out; struct rb_node *node; - if (sort__need_collapse) + if (hists__has(hists, need_collapse)) root_in = &hists->entries_collapsed; else root_in = hists->entries_in; diff --git a/tools/perf/tests/keep-tracking.c b/tools/perf/tests/keep-tracking.c index ddb78fae064a..614e45a3c603 100644 --- a/tools/perf/tests/keep-tracking.c +++ b/tools/perf/tests/keep-tracking.c @@ -80,7 +80,7 @@ int test__keep_tracking(int subtest __maybe_unused) CHECK__(parse_events(evlist, "dummy:u", NULL)); CHECK__(parse_events(evlist, "cycles:u", NULL)); - perf_evlist__config(evlist, &opts); + perf_evlist__config(evlist, &opts, NULL); evsel = perf_evlist__first(evlist); diff --git a/tools/perf/tests/openat-syscall-tp-fields.c b/tools/perf/tests/openat-syscall-tp-fields.c index eb99a105f31c..4344fe482c1d 100644 --- a/tools/perf/tests/openat-syscall-tp-fields.c +++ b/tools/perf/tests/openat-syscall-tp-fields.c @@ -44,7 +44,7 @@ int test__syscall_openat_tp_fields(int subtest __maybe_unused) goto out_delete_evlist; } - perf_evsel__config(evsel, &opts); + perf_evsel__config(evsel, &opts, NULL); thread_map__set_pid(evlist->threads, 0, getpid()); diff --git a/tools/perf/tests/perf-record.c b/tools/perf/tests/perf-record.c index 1cc78cefe399..b836ee6a8d9b 100644 --- a/tools/perf/tests/perf-record.c +++ b/tools/perf/tests/perf-record.c @@ -99,7 +99,7 @@ int test__PERF_RECORD(int subtest __maybe_unused) perf_evsel__set_sample_bit(evsel, CPU); perf_evsel__set_sample_bit(evsel, TID); 
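The hists tests above stop hardcoding PERF_MAX_STACK_DEPTH and instead pass sysctl_perf_event_max_stack, which perf.c now seeds from the kernel.perf_event_max_stack sysctl at startup. A rough standalone sketch of that sysctl-with-fallback idea, reading /proc/sys directly rather than going through perf's sysctl__read_int() helper:

#include <stdio.h>

#define PERF_MAX_STACK_DEPTH 127	/* compile-time fallback, as in the perf sources */

/* Prefer the runtime limit exported by the kernel; fall back to the
 * compile-time default when the sysctl file cannot be read. */
static unsigned int max_stack_depth(void)
{
	unsigned int value = PERF_MAX_STACK_DEPTH;
	FILE *f = fopen("/proc/sys/kernel/perf_event_max_stack", "r");

	if (f) {
		if (fscanf(f, "%u", &value) != 1)
			value = PERF_MAX_STACK_DEPTH;
		fclose(f);
	}
	return value;
}

int main(void)
{
	printf("using a maximum callchain depth of %u\n", max_stack_depth());
	return 0;
}

Whatever value is found this way becomes the default for --max-stack style options, with the compile-time constant used only as a last resort.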
perf_evsel__set_sample_bit(evsel, TIME); - perf_evlist__config(evlist, &opts); + perf_evlist__config(evlist, &opts, NULL); err = sched__get_first_possible_cpu(evlist->workload.pid, &cpu_mask); if (err < 0) { diff --git a/tools/perf/tests/switch-tracking.c b/tools/perf/tests/switch-tracking.c index ebd80168d51e..39a689bf7574 100644 --- a/tools/perf/tests/switch-tracking.c +++ b/tools/perf/tests/switch-tracking.c @@ -417,7 +417,7 @@ int test__switch_tracking(int subtest __maybe_unused) perf_evsel__set_sample_bit(tracking_evsel, TIME); /* Config events */ - perf_evlist__config(evlist, &opts); + perf_evlist__config(evlist, &opts, NULL); /* Check moved event is still at the front */ if (cycles_evsel != perf_evlist__first(evlist)) { diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h index 82b2b5e6ba7c..c57e72c826d2 100644 --- a/tools/perf/tests/tests.h +++ b/tools/perf/tests/tests.h @@ -85,6 +85,8 @@ int test__synthesize_stat_config(int subtest); int test__synthesize_stat(int subtest); int test__synthesize_stat_round(int subtest); int test__event_update(int subtest); +int test__event_times(int subtest); +int test__backward_ring_buffer(int subtest); #if defined(__arm__) || defined(__aarch64__) #ifdef HAVE_DWARF_UNWIND_SUPPORT diff --git a/tools/perf/tests/vmlinux-kallsyms.c b/tools/perf/tests/vmlinux-kallsyms.c index 630b0b409b97..e63abab7d5a1 100644 --- a/tools/perf/tests/vmlinux-kallsyms.c +++ b/tools/perf/tests/vmlinux-kallsyms.c @@ -54,8 +54,14 @@ int test__vmlinux_matches_kallsyms(int subtest __maybe_unused) * Step 3: * * Load and split /proc/kallsyms into multiple maps, one per module. + * Do not use kcore, as this test was designed before kcore support + * and has parts that only make sense if using the non-kcore code. + * XXX: extend it to stress the kcore code as well, hint: the list + * of modules extracted from /proc/kcore, in its current form, can't + * be compared against the list of modules found in the "vmlinux" + * code and with the one got from /proc/modules from the "kallsyms" code. */ - if (machine__load_kallsyms(&kallsyms, "/proc/kallsyms", type, NULL) <= 0) { + if (__machine__load_kallsyms(&kallsyms, "/proc/kallsyms", type, true, NULL) <= 0) { pr_debug("dso__load_kallsyms "); goto out; } @@ -157,6 +163,9 @@ next_pair: pr_debug("%#" PRIx64 ": diff name v: %s k: %s\n", mem_start, sym->name, pair->name); + } else { + pr_debug("%#" PRIx64 ": diff name v: %s k: %s\n", + mem_start, sym->name, first_pair->name); } } } else diff --git a/tools/perf/trace/beauty/eventfd.c b/tools/perf/trace/beauty/eventfd.c new file mode 100644 index 000000000000..d64f4a9128a1 --- /dev/null +++ b/tools/perf/trace/beauty/eventfd.c @@ -0,0 +1,38 @@ +#include <sys/eventfd.h> + +#ifndef EFD_SEMAPHORE +#define EFD_SEMAPHORE 1 +#endif + +#ifndef EFD_NONBLOCK +#define EFD_NONBLOCK 00004000 +#endif + +#ifndef EFD_CLOEXEC +#define EFD_CLOEXEC 02000000 +#endif + +static size_t syscall_arg__scnprintf_eventfd_flags(char *bf, size_t size, struct syscall_arg *arg) +{ + int printed = 0, flags = arg->val; + + if (flags == 0) + return scnprintf(bf, size, "NONE"); +#define P_FLAG(n) \ + if (flags & EFD_##n) { \ + printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ + flags &= ~EFD_##n; \ + } + + P_FLAG(SEMAPHORE); + P_FLAG(CLOEXEC); + P_FLAG(NONBLOCK); +#undef P_FLAG + + if (flags) + printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ?
"|" : "", flags); + + return printed; +} + +#define SCA_EFD_FLAGS syscall_arg__scnprintf_eventfd_flags diff --git a/tools/perf/trace/beauty/flock.c b/tools/perf/trace/beauty/flock.c new file mode 100644 index 000000000000..021bb48c6336 --- /dev/null +++ b/tools/perf/trace/beauty/flock.c @@ -0,0 +1,31 @@ + +static size_t syscall_arg__scnprintf_flock(char *bf, size_t size, + struct syscall_arg *arg) +{ + int printed = 0, op = arg->val; + + if (op == 0) + return scnprintf(bf, size, "NONE"); +#define P_CMD(cmd) \ + if ((op & LOCK_##cmd) == LOCK_##cmd) { \ + printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #cmd); \ + op &= ~LOCK_##cmd; \ + } + + P_CMD(SH); + P_CMD(EX); + P_CMD(NB); + P_CMD(UN); + P_CMD(MAND); + P_CMD(RW); + P_CMD(READ); + P_CMD(WRITE); +#undef P_OP + + if (op) + printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", op); + + return printed; +} + +#define SCA_FLOCK syscall_arg__scnprintf_flock diff --git a/tools/perf/trace/beauty/futex_op.c b/tools/perf/trace/beauty/futex_op.c new file mode 100644 index 000000000000..e2476211f22d --- /dev/null +++ b/tools/perf/trace/beauty/futex_op.c @@ -0,0 +1,44 @@ +#include <linux/futex.h> + +static size_t syscall_arg__scnprintf_futex_op(char *bf, size_t size, struct syscall_arg *arg) +{ + enum syscall_futex_args { + SCF_UADDR = (1 << 0), + SCF_OP = (1 << 1), + SCF_VAL = (1 << 2), + SCF_TIMEOUT = (1 << 3), + SCF_UADDR2 = (1 << 4), + SCF_VAL3 = (1 << 5), + }; + int op = arg->val; + int cmd = op & FUTEX_CMD_MASK; + size_t printed = 0; + + switch (cmd) { +#define P_FUTEX_OP(n) case FUTEX_##n: printed = scnprintf(bf, size, #n); + P_FUTEX_OP(WAIT); arg->mask |= SCF_VAL3|SCF_UADDR2; break; + P_FUTEX_OP(WAKE); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break; + P_FUTEX_OP(FD); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break; + P_FUTEX_OP(REQUEUE); arg->mask |= SCF_VAL3|SCF_TIMEOUT; break; + P_FUTEX_OP(CMP_REQUEUE); arg->mask |= SCF_TIMEOUT; break; + P_FUTEX_OP(CMP_REQUEUE_PI); arg->mask |= SCF_TIMEOUT; break; + P_FUTEX_OP(WAKE_OP); break; + P_FUTEX_OP(LOCK_PI); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break; + P_FUTEX_OP(UNLOCK_PI); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break; + P_FUTEX_OP(TRYLOCK_PI); arg->mask |= SCF_VAL3|SCF_UADDR2; break; + P_FUTEX_OP(WAIT_BITSET); arg->mask |= SCF_UADDR2; break; + P_FUTEX_OP(WAKE_BITSET); arg->mask |= SCF_UADDR2; break; + P_FUTEX_OP(WAIT_REQUEUE_PI); break; + default: printed = scnprintf(bf, size, "%#x", cmd); break; + } + + if (op & FUTEX_PRIVATE_FLAG) + printed += scnprintf(bf + printed, size - printed, "|PRIV"); + + if (op & FUTEX_CLOCK_REALTIME) + printed += scnprintf(bf + printed, size - printed, "|CLKRT"); + + return printed; +} + +#define SCA_FUTEX_OP syscall_arg__scnprintf_futex_op diff --git a/tools/perf/trace/beauty/mmap.c b/tools/perf/trace/beauty/mmap.c new file mode 100644 index 000000000000..3444a4d5382d --- /dev/null +++ b/tools/perf/trace/beauty/mmap.c @@ -0,0 +1,158 @@ +#include <sys/mman.h> + +static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size, + struct syscall_arg *arg) +{ + int printed = 0, prot = arg->val; + + if (prot == PROT_NONE) + return scnprintf(bf, size, "NONE"); +#define P_MMAP_PROT(n) \ + if (prot & PROT_##n) { \ + printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? 
"|" : "", #n); \ + prot &= ~PROT_##n; \ + } + + P_MMAP_PROT(EXEC); + P_MMAP_PROT(READ); + P_MMAP_PROT(WRITE); +#ifdef PROT_SEM + P_MMAP_PROT(SEM); +#endif + P_MMAP_PROT(GROWSDOWN); + P_MMAP_PROT(GROWSUP); +#undef P_MMAP_PROT + + if (prot) + printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", prot); + + return printed; +} + +#define SCA_MMAP_PROT syscall_arg__scnprintf_mmap_prot + +#ifndef MAP_STACK +# define MAP_STACK 0x20000 +#endif + +static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size, + struct syscall_arg *arg) +{ + int printed = 0, flags = arg->val; + +#define P_MMAP_FLAG(n) \ + if (flags & MAP_##n) { \ + printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ + flags &= ~MAP_##n; \ + } + + P_MMAP_FLAG(SHARED); + P_MMAP_FLAG(PRIVATE); +#ifdef MAP_32BIT + P_MMAP_FLAG(32BIT); +#endif + P_MMAP_FLAG(ANONYMOUS); + P_MMAP_FLAG(DENYWRITE); + P_MMAP_FLAG(EXECUTABLE); + P_MMAP_FLAG(FILE); + P_MMAP_FLAG(FIXED); + P_MMAP_FLAG(GROWSDOWN); +#ifdef MAP_HUGETLB + P_MMAP_FLAG(HUGETLB); +#endif + P_MMAP_FLAG(LOCKED); + P_MMAP_FLAG(NONBLOCK); + P_MMAP_FLAG(NORESERVE); + P_MMAP_FLAG(POPULATE); + P_MMAP_FLAG(STACK); +#ifdef MAP_UNINITIALIZED + P_MMAP_FLAG(UNINITIALIZED); +#endif +#undef P_MMAP_FLAG + + if (flags) + printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); + + return printed; +} + +#define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags + +static size_t syscall_arg__scnprintf_mremap_flags(char *bf, size_t size, + struct syscall_arg *arg) +{ + int printed = 0, flags = arg->val; + +#define P_MREMAP_FLAG(n) \ + if (flags & MREMAP_##n) { \ + printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ + flags &= ~MREMAP_##n; \ + } + + P_MREMAP_FLAG(MAYMOVE); +#ifdef MREMAP_FIXED + P_MREMAP_FLAG(FIXED); +#endif +#undef P_MREMAP_FLAG + + if (flags) + printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? 
"|" : "", flags); + + return printed; +} + +#define SCA_MREMAP_FLAGS syscall_arg__scnprintf_mremap_flags + +#ifndef MADV_HWPOISON +#define MADV_HWPOISON 100 +#endif + +#ifndef MADV_MERGEABLE +#define MADV_MERGEABLE 12 +#endif + +#ifndef MADV_UNMERGEABLE +#define MADV_UNMERGEABLE 13 +#endif + +static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size, + struct syscall_arg *arg) +{ + int behavior = arg->val; + + switch (behavior) { +#define P_MADV_BHV(n) case MADV_##n: return scnprintf(bf, size, #n) + P_MADV_BHV(NORMAL); + P_MADV_BHV(RANDOM); + P_MADV_BHV(SEQUENTIAL); + P_MADV_BHV(WILLNEED); + P_MADV_BHV(DONTNEED); + P_MADV_BHV(REMOVE); + P_MADV_BHV(DONTFORK); + P_MADV_BHV(DOFORK); + P_MADV_BHV(HWPOISON); +#ifdef MADV_SOFT_OFFLINE + P_MADV_BHV(SOFT_OFFLINE); +#endif + P_MADV_BHV(MERGEABLE); + P_MADV_BHV(UNMERGEABLE); +#ifdef MADV_HUGEPAGE + P_MADV_BHV(HUGEPAGE); +#endif +#ifdef MADV_NOHUGEPAGE + P_MADV_BHV(NOHUGEPAGE); +#endif +#ifdef MADV_DONTDUMP + P_MADV_BHV(DONTDUMP); +#endif +#ifdef MADV_DODUMP + P_MADV_BHV(DODUMP); +#endif +#undef P_MADV_PHV + default: break; + } + + return scnprintf(bf, size, "%#x", behavior); +} + +#define SCA_MADV_BHV syscall_arg__scnprintf_madvise_behavior diff --git a/tools/perf/trace/beauty/mode_t.c b/tools/perf/trace/beauty/mode_t.c new file mode 100644 index 000000000000..930d8fef2400 --- /dev/null +++ b/tools/perf/trace/beauty/mode_t.c @@ -0,0 +1,68 @@ +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> + +/* From include/linux/stat.h */ +#ifndef S_IRWXUGO +#define S_IRWXUGO (S_IRWXU|S_IRWXG|S_IRWXO) +#endif +#ifndef S_IALLUGO +#define S_IALLUGO (S_ISUID|S_ISGID|S_ISVTX|S_IRWXUGO) +#endif +#ifndef S_IRUGO +#define S_IRUGO (S_IRUSR|S_IRGRP|S_IROTH) +#endif +#ifndef S_IWUGO +#define S_IWUGO (S_IWUSR|S_IWGRP|S_IWOTH) +#endif +#ifndef S_IXUGO +#define S_IXUGO (S_IXUSR|S_IXGRP|S_IXOTH) +#endif + +static size_t syscall_arg__scnprintf_mode_t(char *bf, size_t size, struct syscall_arg *arg) +{ + int printed = 0, mode = arg->val; + +#define P_MODE(n) \ + if ((mode & S_##n) == S_##n) { \ + printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ + mode &= ~S_##n; \ + } + + P_MODE(IALLUGO); + P_MODE(IRWXUGO); + P_MODE(IRUGO); + P_MODE(IWUGO); + P_MODE(IXUGO); + P_MODE(IFMT); + P_MODE(IFSOCK); + P_MODE(IFLNK); + P_MODE(IFREG); + P_MODE(IFBLK); + P_MODE(IFDIR); + P_MODE(IFCHR); + P_MODE(IFIFO); + P_MODE(ISUID); + P_MODE(ISGID); + P_MODE(ISVTX); + P_MODE(IRWXU); + P_MODE(IRUSR); + P_MODE(IWUSR); + P_MODE(IXUSR); + P_MODE(IRWXG); + P_MODE(IRGRP); + P_MODE(IWGRP); + P_MODE(IXGRP); + P_MODE(IRWXO); + P_MODE(IROTH); + P_MODE(IWOTH); + P_MODE(IXOTH); +#undef P_MODE + + if (mode) + printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? 
"|" : "", mode); + + return printed; +} + +#define SCA_MODE_T syscall_arg__scnprintf_mode_t diff --git a/tools/perf/trace/beauty/msg_flags.c b/tools/perf/trace/beauty/msg_flags.c new file mode 100644 index 000000000000..07fa8a0acad6 --- /dev/null +++ b/tools/perf/trace/beauty/msg_flags.c @@ -0,0 +1,62 @@ +#include <sys/types.h> +#include <sys/socket.h> + +#ifndef MSG_PROBE +#define MSG_PROBE 0x10 +#endif +#ifndef MSG_WAITFORONE +#define MSG_WAITFORONE 0x10000 +#endif +#ifndef MSG_SENDPAGE_NOTLAST +#define MSG_SENDPAGE_NOTLAST 0x20000 +#endif +#ifndef MSG_FASTOPEN +#define MSG_FASTOPEN 0x20000000 +#endif +#ifndef MSG_CMSG_CLOEXEC +# define MSG_CMSG_CLOEXEC 0x40000000 +#endif + +static size_t syscall_arg__scnprintf_msg_flags(char *bf, size_t size, + struct syscall_arg *arg) +{ + int printed = 0, flags = arg->val; + + if (flags == 0) + return scnprintf(bf, size, "NONE"); +#define P_MSG_FLAG(n) \ + if (flags & MSG_##n) { \ + printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ + flags &= ~MSG_##n; \ + } + + P_MSG_FLAG(OOB); + P_MSG_FLAG(PEEK); + P_MSG_FLAG(DONTROUTE); + P_MSG_FLAG(TRYHARD); + P_MSG_FLAG(CTRUNC); + P_MSG_FLAG(PROBE); + P_MSG_FLAG(TRUNC); + P_MSG_FLAG(DONTWAIT); + P_MSG_FLAG(EOR); + P_MSG_FLAG(WAITALL); + P_MSG_FLAG(FIN); + P_MSG_FLAG(SYN); + P_MSG_FLAG(CONFIRM); + P_MSG_FLAG(RST); + P_MSG_FLAG(ERRQUEUE); + P_MSG_FLAG(NOSIGNAL); + P_MSG_FLAG(MORE); + P_MSG_FLAG(WAITFORONE); + P_MSG_FLAG(SENDPAGE_NOTLAST); + P_MSG_FLAG(FASTOPEN); + P_MSG_FLAG(CMSG_CLOEXEC); +#undef P_MSG_FLAG + + if (flags) + printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); + + return printed; +} + +#define SCA_MSG_FLAGS syscall_arg__scnprintf_msg_flags diff --git a/tools/perf/trace/beauty/open_flags.c b/tools/perf/trace/beauty/open_flags.c new file mode 100644 index 000000000000..0f3679e0cdcf --- /dev/null +++ b/tools/perf/trace/beauty/open_flags.c @@ -0,0 +1,56 @@ + +static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size, + struct syscall_arg *arg) +{ + int printed = 0, flags = arg->val; + + if (!(flags & O_CREAT)) + arg->mask |= 1 << (arg->idx + 1); /* Mask the mode parm */ + + if (flags == 0) + return scnprintf(bf, size, "RDONLY"); +#define P_FLAG(n) \ + if (flags & O_##n) { \ + printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ + flags &= ~O_##n; \ + } + + P_FLAG(APPEND); + P_FLAG(ASYNC); + P_FLAG(CLOEXEC); + P_FLAG(CREAT); + P_FLAG(DIRECT); + P_FLAG(DIRECTORY); + P_FLAG(EXCL); + P_FLAG(LARGEFILE); + P_FLAG(NOATIME); + P_FLAG(NOCTTY); +#ifdef O_NONBLOCK + P_FLAG(NONBLOCK); +#elif O_NDELAY + P_FLAG(NDELAY); +#endif +#ifdef O_PATH + P_FLAG(PATH); +#endif + P_FLAG(RDWR); +#ifdef O_DSYNC + if ((flags & O_SYNC) == O_SYNC) + printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", "SYNC"); + else { + P_FLAG(DSYNC); + } +#else + P_FLAG(SYNC); +#endif + P_FLAG(TRUNC); + P_FLAG(WRONLY); +#undef P_FLAG + + if (flags) + printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? 
"|" : "", flags); + + return printed; +} + +#define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags diff --git a/tools/perf/trace/beauty/perf_event_open.c b/tools/perf/trace/beauty/perf_event_open.c new file mode 100644 index 000000000000..311f09dd718d --- /dev/null +++ b/tools/perf/trace/beauty/perf_event_open.c @@ -0,0 +1,43 @@ +#ifndef PERF_FLAG_FD_NO_GROUP +# define PERF_FLAG_FD_NO_GROUP (1UL << 0) +#endif + +#ifndef PERF_FLAG_FD_OUTPUT +# define PERF_FLAG_FD_OUTPUT (1UL << 1) +#endif + +#ifndef PERF_FLAG_PID_CGROUP +# define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup id, per-cpu mode only */ +#endif + +#ifndef PERF_FLAG_FD_CLOEXEC +# define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */ +#endif + +static size_t syscall_arg__scnprintf_perf_flags(char *bf, size_t size, + struct syscall_arg *arg) +{ + int printed = 0, flags = arg->val; + + if (flags == 0) + return 0; + +#define P_FLAG(n) \ + if (flags & PERF_FLAG_##n) { \ + printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ + flags &= ~PERF_FLAG_##n; \ + } + + P_FLAG(FD_NO_GROUP); + P_FLAG(FD_OUTPUT); + P_FLAG(PID_CGROUP); + P_FLAG(FD_CLOEXEC); +#undef P_FLAG + + if (flags) + printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); + + return printed; +} + +#define SCA_PERF_FLAGS syscall_arg__scnprintf_perf_flags diff --git a/tools/perf/trace/beauty/pid.c b/tools/perf/trace/beauty/pid.c new file mode 100644 index 000000000000..07486ea65ae3 --- /dev/null +++ b/tools/perf/trace/beauty/pid.c @@ -0,0 +1,21 @@ +static size_t syscall_arg__scnprintf_pid(char *bf, size_t size, struct syscall_arg *arg) +{ + int pid = arg->val; + struct trace *trace = arg->trace; + size_t printed = scnprintf(bf, size, "%d", pid); + struct thread *thread = machine__findnew_thread(trace->host, pid, pid); + + if (thread != NULL) { + if (!thread->comm_set) + thread__set_comm_from_proc(thread); + + if (thread->comm_set) + printed += scnprintf(bf + printed, size - printed, + " (%s)", thread__comm_str(thread)); + thread__put(thread); + } + + return printed; +} + +#define SCA_PID syscall_arg__scnprintf_pid diff --git a/tools/perf/trace/beauty/sched_policy.c b/tools/perf/trace/beauty/sched_policy.c new file mode 100644 index 000000000000..c205bc608b3c --- /dev/null +++ b/tools/perf/trace/beauty/sched_policy.c @@ -0,0 +1,44 @@ +#include <sched.h> + +/* + * Not defined anywhere else, probably, just to make sure we + * catch future flags + */ +#define SCHED_POLICY_MASK 0xff + +#ifndef SCHED_DEADLINE +#define SCHED_DEADLINE 6 +#endif + +static size_t syscall_arg__scnprintf_sched_policy(char *bf, size_t size, + struct syscall_arg *arg) +{ + const char *policies[] = { + "NORMAL", "FIFO", "RR", "BATCH", "ISO", "IDLE", "DEADLINE", + }; + size_t printed; + int policy = arg->val, + flags = policy & ~SCHED_POLICY_MASK; + + policy &= SCHED_POLICY_MASK; + if (policy <= SCHED_DEADLINE) + printed = scnprintf(bf, size, "%s", policies[policy]); + else + printed = scnprintf(bf, size, "%#x", policy); + +#define P_POLICY_FLAG(n) \ + if (flags & SCHED_##n) { \ + printed += scnprintf(bf + printed, size - printed, "|%s", #n); \ + flags &= ~SCHED_##n; \ + } + + P_POLICY_FLAG(RESET_ON_FORK); +#undef P_POLICY_FLAG + + if (flags) + printed += scnprintf(bf + printed, size - printed, "|%#x", flags); + + return printed; +} + +#define SCA_SCHED_POLICY syscall_arg__scnprintf_sched_policy diff --git a/tools/perf/trace/beauty/seccomp.c b/tools/perf/trace/beauty/seccomp.c new file mode 100644 index 000000000000..213c5a7e3e92 --- 
/dev/null +++ b/tools/perf/trace/beauty/seccomp.c @@ -0,0 +1,52 @@ +#include <linux/seccomp.h> + +#ifndef SECCOMP_SET_MODE_STRICT +#define SECCOMP_SET_MODE_STRICT 0 +#endif +#ifndef SECCOMP_SET_MODE_FILTER +#define SECCOMP_SET_MODE_FILTER 1 +#endif + +static size_t syscall_arg__scnprintf_seccomp_op(char *bf, size_t size, struct syscall_arg *arg) +{ + int op = arg->val; + size_t printed = 0; + + switch (op) { +#define P_SECCOMP_SET_MODE_OP(n) case SECCOMP_SET_MODE_##n: printed = scnprintf(bf, size, #n); break + P_SECCOMP_SET_MODE_OP(STRICT); + P_SECCOMP_SET_MODE_OP(FILTER); +#undef P_SECCOMP_SET_MODE_OP + default: printed = scnprintf(bf, size, "%#x", op); break; + } + + return printed; +} + +#define SCA_SECCOMP_OP syscall_arg__scnprintf_seccomp_op + +#ifndef SECCOMP_FILTER_FLAG_TSYNC +#define SECCOMP_FILTER_FLAG_TSYNC 1 +#endif + +static size_t syscall_arg__scnprintf_seccomp_flags(char *bf, size_t size, + struct syscall_arg *arg) +{ + int printed = 0, flags = arg->val; + +#define P_FLAG(n) \ + if (flags & SECCOMP_FILTER_FLAG_##n) { \ + printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ + flags &= ~SECCOMP_FILTER_FLAG_##n; \ + } + + P_FLAG(TSYNC); +#undef P_FLAG + + if (flags) + printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); + + return printed; +} + +#define SCA_SECCOMP_FLAGS syscall_arg__scnprintf_seccomp_flags diff --git a/tools/perf/trace/beauty/signum.c b/tools/perf/trace/beauty/signum.c new file mode 100644 index 000000000000..d3b0b1fab077 --- /dev/null +++ b/tools/perf/trace/beauty/signum.c @@ -0,0 +1,53 @@ + +static size_t syscall_arg__scnprintf_signum(char *bf, size_t size, struct syscall_arg *arg) +{ + int sig = arg->val; + + switch (sig) { +#define P_SIGNUM(n) case SIG##n: return scnprintf(bf, size, #n) + P_SIGNUM(HUP); + P_SIGNUM(INT); + P_SIGNUM(QUIT); + P_SIGNUM(ILL); + P_SIGNUM(TRAP); + P_SIGNUM(ABRT); + P_SIGNUM(BUS); + P_SIGNUM(FPE); + P_SIGNUM(KILL); + P_SIGNUM(USR1); + P_SIGNUM(SEGV); + P_SIGNUM(USR2); + P_SIGNUM(PIPE); + P_SIGNUM(ALRM); + P_SIGNUM(TERM); + P_SIGNUM(CHLD); + P_SIGNUM(CONT); + P_SIGNUM(STOP); + P_SIGNUM(TSTP); + P_SIGNUM(TTIN); + P_SIGNUM(TTOU); + P_SIGNUM(URG); + P_SIGNUM(XCPU); + P_SIGNUM(XFSZ); + P_SIGNUM(VTALRM); + P_SIGNUM(PROF); + P_SIGNUM(WINCH); + P_SIGNUM(IO); + P_SIGNUM(PWR); + P_SIGNUM(SYS); +#ifdef SIGEMT + P_SIGNUM(EMT); +#endif +#ifdef SIGSTKFLT + P_SIGNUM(STKFLT); +#endif +#ifdef SIGSWI + P_SIGNUM(SWI); +#endif + default: break; + } + + return scnprintf(bf, size, "%#x", sig); +} + +#define SCA_SIGNUM syscall_arg__scnprintf_signum diff --git a/tools/perf/trace/beauty/socket_type.c b/tools/perf/trace/beauty/socket_type.c new file mode 100644 index 000000000000..0a5ce818131c --- /dev/null +++ b/tools/perf/trace/beauty/socket_type.c @@ -0,0 +1,60 @@ +#include <sys/types.h> +#include <sys/socket.h> + +#ifndef SOCK_DCCP +# define SOCK_DCCP 6 +#endif + +#ifndef SOCK_CLOEXEC +# define SOCK_CLOEXEC 02000000 +#endif + +#ifndef SOCK_NONBLOCK +# define SOCK_NONBLOCK 00004000 +#endif + +#ifndef SOCK_TYPE_MASK +#define SOCK_TYPE_MASK 0xf +#endif + +static size_t syscall_arg__scnprintf_socket_type(char *bf, size_t size, struct syscall_arg *arg) +{ + size_t printed; + int type = arg->val, + flags = type & ~SOCK_TYPE_MASK; + + type &= SOCK_TYPE_MASK; + /* + * Can't use a strarray, MIPS may override for ABI reasons. 
+ */ + switch (type) { +#define P_SK_TYPE(n) case SOCK_##n: printed = scnprintf(bf, size, #n); break; + P_SK_TYPE(STREAM); + P_SK_TYPE(DGRAM); + P_SK_TYPE(RAW); + P_SK_TYPE(RDM); + P_SK_TYPE(SEQPACKET); + P_SK_TYPE(DCCP); + P_SK_TYPE(PACKET); +#undef P_SK_TYPE + default: + printed = scnprintf(bf, size, "%#x", type); + } + +#define P_SK_FLAG(n) \ + if (flags & SOCK_##n) { \ + printed += scnprintf(bf + printed, size - printed, "|%s", #n); \ + flags &= ~SOCK_##n; \ + } + + P_SK_FLAG(CLOEXEC); + P_SK_FLAG(NONBLOCK); +#undef P_SK_FLAG + + if (flags) + printed += scnprintf(bf + printed, size - printed, "|%#x", flags); + + return printed; +} + +#define SCA_SK_TYPE syscall_arg__scnprintf_socket_type diff --git a/tools/perf/trace/beauty/waitid_options.c b/tools/perf/trace/beauty/waitid_options.c new file mode 100644 index 000000000000..7942724adec8 --- /dev/null +++ b/tools/perf/trace/beauty/waitid_options.c @@ -0,0 +1,26 @@ +#include <sys/types.h> +#include <sys/wait.h> + +static size_t syscall_arg__scnprintf_waitid_options(char *bf, size_t size, + struct syscall_arg *arg) +{ + int printed = 0, options = arg->val; + +#define P_OPTION(n) \ + if (options & W##n) { \ + printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ + options &= ~W##n; \ + } + + P_OPTION(NOHANG); + P_OPTION(UNTRACED); + P_OPTION(CONTINUED); +#undef P_OPTION + + if (options) + printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", options); + + return printed; +} + +#define SCA_WAITID_OPTIONS syscall_arg__scnprintf_waitid_options diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 2a83414159a6..538bae880bfe 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -1607,9 +1607,8 @@ static int hists_browser__scnprintf_hierarchy_headers(struct hist_browser *brows ret = fmt->header(fmt, &dummy_hpp, hists_to_evsel(hists)); dummy_hpp.buf[ret] = '\0'; - rtrim(dummy_hpp.buf); - start = ltrim(dummy_hpp.buf); + start = trim(dummy_hpp.buf); ret = strlen(start); if (start != dummy_hpp.buf) @@ -1897,11 +1896,10 @@ static int hist_browser__fprintf_entry(struct hist_browser *browser, bool first = true; int ret; - if (symbol_conf.use_callchain) + if (symbol_conf.use_callchain) { folded_sign = hist_entry__folded(he); - - if (symbol_conf.use_callchain) printed += fprintf(fp, "%c ", folded_sign); + } hists__for_each_format(browser->hists, fmt) { if (perf_hpp__should_skip(fmt, he->hists)) @@ -2137,7 +2135,7 @@ static int hists__browser_title(struct hists *hists, printed += snprintf(bf + printed, size - printed, ", UID: %s", hists->uid_filter_str); if (thread) { - if (sort__has_thread) { + if (hists__has(hists, thread)) { printed += scnprintf(bf + printed, size - printed, ", Thread: %s(%d)", (thread->comm_set ? thread__comm_str(thread) : ""), @@ -2322,7 +2320,8 @@ do_zoom_thread(struct hist_browser *browser, struct popup_action *act) { struct thread *thread = act->thread; - if ((!sort__has_thread && !sort__has_comm) || thread == NULL) + if ((!hists__has(browser->hists, thread) && + !hists__has(browser->hists, comm)) || thread == NULL) return 0; if (browser->hists->thread_filter) { @@ -2331,7 +2330,7 @@ do_zoom_thread(struct hist_browser *browser, struct popup_action *act) thread__zput(browser->hists->thread_filter); ui_helpline__pop(); } else { - if (sort__has_thread) { + if (hists__has(browser->hists, thread)) { ui_helpline__fpush("To zoom out press ESC or ENTER + \"Zoom out of %s(%d) thread\"", thread->comm_set ? 
thread__comm_str(thread) : "", thread->tid); @@ -2356,10 +2355,11 @@ add_thread_opt(struct hist_browser *browser, struct popup_action *act, { int ret; - if ((!sort__has_thread && !sort__has_comm) || thread == NULL) + if ((!hists__has(browser->hists, thread) && + !hists__has(browser->hists, comm)) || thread == NULL) return 0; - if (sort__has_thread) { + if (hists__has(browser->hists, thread)) { ret = asprintf(optstr, "Zoom %s %s(%d) thread", browser->hists->thread_filter ? "out of" : "into", thread->comm_set ? thread__comm_str(thread) : "", @@ -2382,7 +2382,7 @@ do_zoom_dso(struct hist_browser *browser, struct popup_action *act) { struct map *map = act->ms.map; - if (!sort__has_dso || map == NULL) + if (!hists__has(browser->hists, dso) || map == NULL) return 0; if (browser->hists->dso_filter) { @@ -2409,7 +2409,7 @@ static int add_dso_opt(struct hist_browser *browser, struct popup_action *act, char **optstr, struct map *map) { - if (!sort__has_dso || map == NULL) + if (!hists__has(browser->hists, dso) || map == NULL) return 0; if (asprintf(optstr, "Zoom %s %s DSO", @@ -2431,10 +2431,10 @@ do_browse_map(struct hist_browser *browser __maybe_unused, } static int -add_map_opt(struct hist_browser *browser __maybe_unused, +add_map_opt(struct hist_browser *browser, struct popup_action *act, char **optstr, struct map *map) { - if (!sort__has_dso || map == NULL) + if (!hists__has(browser->hists, dso) || map == NULL) return 0; if (asprintf(optstr, "Browse map details") < 0) @@ -2536,7 +2536,7 @@ add_exit_opt(struct hist_browser *browser __maybe_unused, static int do_zoom_socket(struct hist_browser *browser, struct popup_action *act) { - if (!sort__has_socket || act->socket < 0) + if (!hists__has(browser->hists, socket) || act->socket < 0) return 0; if (browser->hists->socket_filter > -1) { @@ -2558,7 +2558,7 @@ static int add_socket_opt(struct hist_browser *browser, struct popup_action *act, char **optstr, int socket_id) { - if (!sort__has_socket || socket_id < 0) + if (!hists__has(browser->hists, socket) || socket_id < 0) return 0; if (asprintf(optstr, "Zoom %s Processor Socket %d", @@ -2749,7 +2749,7 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events, */ goto out_free_stack; case 'a': - if (!sort__has_sym) { + if (!hists__has(hists, sym)) { ui_browser__warning(&browser->b, delay_secs * 2, "Annotation is only available for symbolic views, " "include \"sym*\" in --sort to use it."); @@ -2912,7 +2912,7 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events, continue; } - if (!sort__has_sym || browser->selection == NULL) + if (!hists__has(hists, sym) || browser->selection == NULL) goto skip_annotation; if (sort__mode == SORT_MODE__BRANCH) { @@ -2956,7 +2956,7 @@ skip_annotation: goto skip_scripting; if (browser->he_selection) { - if (sort__has_thread && thread) { + if (hists__has(hists, thread) && thread) { nr_options += add_script_opt(browser, &actions[nr_options], &options[nr_options], @@ -2971,7 +2971,7 @@ skip_annotation: * * See hist_browser__show_entry. 
*/ - if (sort__has_sym && browser->selection->sym) { + if (hists__has(hists, sym) && browser->selection->sym) { nr_options += add_script_opt(browser, &actions[nr_options], &options[nr_options], diff --git a/tools/perf/ui/gtk/hists.c b/tools/perf/ui/gtk/hists.c index 2aa45b606fa4..932adfaa05af 100644 --- a/tools/perf/ui/gtk/hists.c +++ b/tools/perf/ui/gtk/hists.c @@ -379,7 +379,7 @@ static void perf_gtk__show_hists(GtkWidget *window, struct hists *hists, gtk_tree_store_set(store, &iter, col_idx++, s, -1); } - if (symbol_conf.use_callchain && sort__has_sym) { + if (symbol_conf.use_callchain && hists__has(hists, sym)) { if (callchain_param.mode == CHAIN_GRAPH_REL) total = symbol_conf.cumulate_callchain ? h->stat_acc->period : h->stat.period; diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index 3baeaa6e71b5..af07ffb129ca 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -635,7 +635,7 @@ unsigned int hists__sort_list_width(struct hists *hists) ret += fmt->width(fmt, &dummy_hpp, hists_to_evsel(hists)); } - if (verbose && sort__has_sym) /* Addr + origin */ + if (verbose && hists__has(hists, sym)) /* Addr + origin */ ret += 3 + BITS_PER_LONG / 4; return ret; diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c index 7aff5acf3265..560eb47d56f9 100644 --- a/tools/perf/ui/stdio/hist.c +++ b/tools/perf/ui/stdio/hist.c @@ -569,9 +569,8 @@ static int print_hierarchy_header(struct hists *hists, struct perf_hpp *hpp, first_col = false; fmt->header(fmt, hpp, hists_to_evsel(hists)); - rtrim(hpp->buf); - header_width += fprintf(fp, "%s", ltrim(hpp->buf)); + header_width += fprintf(fp, "%s", trim(hpp->buf)); } } diff --git a/tools/perf/util/Build b/tools/perf/util/Build index da48fd843438..8c6c8a0ca642 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -8,6 +8,7 @@ libperf-y += env.o libperf-y += event.o libperf-y += evlist.o libperf-y += evsel.o +libperf-y += evsel_fprintf.o libperf-y += find_bit.o libperf-y += kallsyms.o libperf-y += levenshtein.o @@ -26,9 +27,9 @@ libperf-y += strlist.o libperf-y += strfilter.o libperf-y += top.o libperf-y += usage.o -libperf-y += wrapper.o libperf-y += dso.o libperf-y += symbol.o +libperf-y += symbol_fprintf.o libperf-y += color.o libperf-y += header.o libperf-y += callchain.o @@ -38,6 +39,7 @@ libperf-y += machine.o libperf-y += map.o libperf-y += pstack.o libperf-y += session.o +libperf-$(CONFIG_AUDIT) += syscalltbl.o libperf-y += ordered-events.o libperf-y += comm.o libperf-y += thread.o @@ -69,9 +71,9 @@ libperf-y += stat-shadow.o libperf-y += record.o libperf-y += srcline.o libperf-y += data.o -libperf-$(CONFIG_X86) += tsc.o -libperf-$(CONFIG_AUXTRACE) += tsc.o +libperf-y += tsc.o libperf-y += cloexec.o +libperf-y += call-path.o libperf-y += thread-stack.o libperf-$(CONFIG_AUXTRACE) += auxtrace.o libperf-$(CONFIG_AUXTRACE) += intel-pt-decoder/ diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index b795b6994144..4db73d5a0dbc 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -1138,7 +1138,7 @@ fallback: if (dso->symtab_type == DSO_BINARY_TYPE__KALLSYMS && !dso__is_kcore(dso)) { - char bf[BUILD_ID_SIZE * 2 + 16] = " with build id "; + char bf[SBUILD_ID_SIZE + 15] = " with build id "; char *build_id_msg = NULL; if (dso->annotate_warned) @@ -1665,5 +1665,5 @@ int hist_entry__annotate(struct hist_entry *he, size_t privsize) bool ui__has_annotation(void) { - return use_browser == 1 && sort__has_sym; + return use_browser == 1 && perf_hpp_list.sym; } diff --git 
a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index ec164fe70718..c9169011e55e 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -940,6 +940,7 @@ void itrace_synth_opts__set_default(struct itrace_synth_opts *synth_opts) synth_opts->period = PERF_ITRACE_DEFAULT_PERIOD; synth_opts->callchain_sz = PERF_ITRACE_DEFAULT_CALLCHAIN_SZ; synth_opts->last_branch_sz = PERF_ITRACE_DEFAULT_LAST_BRANCH_SZ; + synth_opts->initial_skip = 0; } /* @@ -1064,6 +1065,12 @@ int itrace_parse_synth_opts(const struct option *opt, const char *str, synth_opts->last_branch_sz = val; } break; + case 's': + synth_opts->initial_skip = strtoul(p, &endptr, 10); + if (p == endptr) + goto out_err; + p = endptr; + break; case ' ': case ',': break; diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h index 57ff31ecb8e4..767989e0e312 100644 --- a/tools/perf/util/auxtrace.h +++ b/tools/perf/util/auxtrace.h @@ -68,6 +68,7 @@ enum itrace_period_type { * @last_branch_sz: branch context size * @period: 'instructions' events period * @period_type: 'instructions' events period type + * @initial_skip: skip N events at the beginning. */ struct itrace_synth_opts { bool set; @@ -86,6 +87,7 @@ struct itrace_synth_opts { unsigned int last_branch_sz; unsigned long long period; enum itrace_period_type period_type; + unsigned long initial_skip; }; /** diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c index 0967ce601931..493307d1414c 100644 --- a/tools/perf/util/bpf-loader.c +++ b/tools/perf/util/bpf-loader.c @@ -842,6 +842,58 @@ bpf_map_op__new(struct parse_events_term *term) return op; } +static struct bpf_map_op * +bpf_map_op__clone(struct bpf_map_op *op) +{ + struct bpf_map_op *newop; + + newop = memdup(op, sizeof(*op)); + if (!newop) { + pr_debug("Failed to alloc bpf_map_op\n"); + return NULL; + } + + INIT_LIST_HEAD(&newop->list); + if (op->key_type == BPF_MAP_KEY_RANGES) { + size_t memsz = op->k.array.nr_ranges * + sizeof(op->k.array.ranges[0]); + + newop->k.array.ranges = memdup(op->k.array.ranges, memsz); + if (!newop->k.array.ranges) { + pr_debug("Failed to alloc indices for map\n"); + free(newop); + return NULL; + } + } + + return newop; +} + +static struct bpf_map_priv * +bpf_map_priv__clone(struct bpf_map_priv *priv) +{ + struct bpf_map_priv *newpriv; + struct bpf_map_op *pos, *newop; + + newpriv = zalloc(sizeof(*newpriv)); + if (!newpriv) { + pr_debug("No enough memory to alloc map private\n"); + return NULL; + } + INIT_LIST_HEAD(&newpriv->ops_list); + + list_for_each_entry(pos, &priv->ops_list, list) { + newop = bpf_map_op__clone(pos); + if (!newop) { + bpf_map_priv__purge(newpriv); + return NULL; + } + list_add_tail(&newop->list, &newpriv->ops_list); + } + + return newpriv; +} + static int bpf_map__add_op(struct bpf_map *map, struct bpf_map_op *op) { @@ -1417,6 +1469,89 @@ int bpf__apply_obj_config(void) return 0; } +#define bpf__for_each_map(pos, obj, objtmp) \ + bpf_object__for_each_safe(obj, objtmp) \ + bpf_map__for_each(pos, obj) + +#define bpf__for_each_stdout_map(pos, obj, objtmp) \ + bpf__for_each_map(pos, obj, objtmp) \ + if (bpf_map__get_name(pos) && \ + (strcmp("__bpf_stdout__", \ + bpf_map__get_name(pos)) == 0)) + +int bpf__setup_stdout(struct perf_evlist *evlist __maybe_unused) +{ + struct bpf_map_priv *tmpl_priv = NULL; + struct bpf_object *obj, *tmp; + struct perf_evsel *evsel = NULL; + struct bpf_map *map; + int err; + bool need_init = false; + + bpf__for_each_stdout_map(map, obj, tmp) { + struct bpf_map_priv *priv; + + err = 
bpf_map__get_private(map, (void **)&priv); + if (err) + return -BPF_LOADER_ERRNO__INTERNAL; + + /* + * No need to check map type: type should have been + * verified by kernel. + */ + if (!need_init && !priv) + need_init = !priv; + if (!tmpl_priv && priv) + tmpl_priv = priv; + } + + if (!need_init) + return 0; + + if (!tmpl_priv) { + err = parse_events(evlist, "bpf-output/no-inherit=1,name=__bpf_stdout__/", + NULL); + if (err) { + pr_debug("ERROR: failed to create bpf-output event\n"); + return -err; + } + + evsel = perf_evlist__last(evlist); + } + + bpf__for_each_stdout_map(map, obj, tmp) { + struct bpf_map_priv *priv; + + err = bpf_map__get_private(map, (void **)&priv); + if (err) + return -BPF_LOADER_ERRNO__INTERNAL; + if (priv) + continue; + + if (tmpl_priv) { + priv = bpf_map_priv__clone(tmpl_priv); + if (!priv) + return -ENOMEM; + + err = bpf_map__set_private(map, priv, bpf_map_priv__clear); + if (err) { + bpf_map_priv__clear(map, priv); + return err; + } + } else if (evsel) { + struct bpf_map_op *op; + + op = bpf_map__add_newop(map, NULL); + if (IS_ERR(op)) + return PTR_ERR(op); + op->op_type = BPF_MAP_OP_SET_EVSEL; + op->v.evsel = evsel; + } + } + + return 0; +} + #define ERRNO_OFFSET(e) ((e) - __BPF_LOADER_ERRNO__START) #define ERRCODE_OFFSET(c) ERRNO_OFFSET(BPF_LOADER_ERRNO__##c) #define NR_ERRNO (__BPF_LOADER_ERRNO__END - __BPF_LOADER_ERRNO__START) @@ -1590,3 +1725,11 @@ int bpf__strerror_apply_obj_config(int err, char *buf, size_t size) bpf__strerror_end(buf, size); return 0; } + +int bpf__strerror_setup_stdout(struct perf_evlist *evlist __maybe_unused, + int err, char *buf, size_t size) +{ + bpf__strerror_head(err, buf, size); + bpf__strerror_end(buf, size); + return 0; +} diff --git a/tools/perf/util/bpf-loader.h b/tools/perf/util/bpf-loader.h index be4311944e3d..941e17275aa7 100644 --- a/tools/perf/util/bpf-loader.h +++ b/tools/perf/util/bpf-loader.h @@ -79,6 +79,11 @@ int bpf__strerror_config_obj(struct bpf_object *obj, size_t size); int bpf__apply_obj_config(void); int bpf__strerror_apply_obj_config(int err, char *buf, size_t size); + +int bpf__setup_stdout(struct perf_evlist *evlist); +int bpf__strerror_setup_stdout(struct perf_evlist *evlist, int err, + char *buf, size_t size); + #else static inline struct bpf_object * bpf__prepare_load(const char *filename __maybe_unused, @@ -125,6 +130,12 @@ bpf__apply_obj_config(void) } static inline int +bpf__setup_stdout(struct perf_evlist *evlist __maybe_unused) +{ + return 0; +} + +static inline int __bpf_strerror(char *buf, size_t size) { if (!size) @@ -177,5 +188,13 @@ bpf__strerror_apply_obj_config(int err __maybe_unused, { return __bpf_strerror(buf, size); } + +static inline int +bpf__strerror_setup_stdout(struct perf_evlist *evlist __maybe_unused, + int err __maybe_unused, char *buf, + size_t size) +{ + return __bpf_strerror(buf, size); +} #endif #endif diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c index 0573c2ec861d..bff425e1232c 100644 --- a/tools/perf/util/build-id.c +++ b/tools/perf/util/build-id.c @@ -261,14 +261,14 @@ static int machine__write_buildid_table(struct machine *machine, int fd) if (dso__is_vdso(pos)) { name = pos->short_name; - name_len = pos->short_name_len + 1; + name_len = pos->short_name_len; } else if (dso__is_kcore(pos)) { machine__mmap_name(machine, nm, sizeof(nm)); name = nm; - name_len = strlen(nm) + 1; + name_len = strlen(nm); } else { name = pos->long_name; - name_len = pos->long_name_len + 1; + name_len = pos->long_name_len; } in_kernel = pos->kernel || @@ -365,39 +365,17 @@ 
static char *build_id_cache__dirname_from_path(const char *name, int build_id_cache__list_build_ids(const char *pathname, struct strlist **result) { - struct strlist *list; char *dir_name; - DIR *dir; - struct dirent *d; int ret = 0; - list = strlist__new(NULL, NULL); dir_name = build_id_cache__dirname_from_path(pathname, false, false); - if (!list || !dir_name) { - ret = -ENOMEM; - goto out; - } + if (!dir_name) + return -ENOMEM; - /* List up all dirents */ - dir = opendir(dir_name); - if (!dir) { + *result = lsdir(dir_name, lsdir_no_dot_filter); + if (!*result) ret = -errno; - goto out; - } - - while ((d = readdir(dir)) != NULL) { - if (!strcmp(d->d_name, ".") || !strcmp(d->d_name, "..")) - continue; - strlist__add(list, d->d_name); - } - closedir(dir); - -out: free(dir_name); - if (ret) - strlist__delete(list); - else - *result = list; return ret; } diff --git a/tools/perf/util/cache.h b/tools/perf/util/cache.h index 1f5a93c2c9a2..0d814bb74661 100644 --- a/tools/perf/util/cache.h +++ b/tools/perf/util/cache.h @@ -40,25 +40,6 @@ int split_cmdline(char *cmdline, const char ***argv); #define alloc_nr(x) (((x)+16)*3/2) -/* - * Realloc the buffer pointed at by variable 'x' so that it can hold - * at least 'nr' entries; the number of entries currently allocated - * is 'alloc', using the standard growing factor alloc_nr() macro. - * - * DO NOT USE any expression with side-effect for 'x' or 'alloc'. - */ -#define ALLOC_GROW(x, nr, alloc) \ - do { \ - if ((nr) > alloc) { \ - if (alloc_nr(alloc) < (nr)) \ - alloc = (nr); \ - else \ - alloc = alloc_nr(alloc); \ - x = xrealloc((x), alloc * sizeof(*(x))); \ - } \ - } while(0) - - static inline int is_absolute_path(const char *path) { return path[0] == '/'; diff --git a/tools/perf/util/call-path.c b/tools/perf/util/call-path.c new file mode 100644 index 000000000000..904a17052e38 --- /dev/null +++ b/tools/perf/util/call-path.c @@ -0,0 +1,122 @@ +/* + * call-path.h: Manipulate a tree data structure containing function call paths + * Copyright (c) 2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ + +#include <linux/rbtree.h> +#include <linux/list.h> + +#include "util.h" +#include "call-path.h" + +static void call_path__init(struct call_path *cp, struct call_path *parent, + struct symbol *sym, u64 ip, bool in_kernel) +{ + cp->parent = parent; + cp->sym = sym; + cp->ip = sym ? 
0 : ip; + cp->db_id = 0; + cp->in_kernel = in_kernel; + RB_CLEAR_NODE(&cp->rb_node); + cp->children = RB_ROOT; +} + +struct call_path_root *call_path_root__new(void) +{ + struct call_path_root *cpr; + + cpr = zalloc(sizeof(struct call_path_root)); + if (!cpr) + return NULL; + call_path__init(&cpr->call_path, NULL, NULL, 0, false); + INIT_LIST_HEAD(&cpr->blocks); + return cpr; +} + +void call_path_root__free(struct call_path_root *cpr) +{ + struct call_path_block *pos, *n; + + list_for_each_entry_safe(pos, n, &cpr->blocks, node) { + list_del(&pos->node); + free(pos); + } + free(cpr); +} + +static struct call_path *call_path__new(struct call_path_root *cpr, + struct call_path *parent, + struct symbol *sym, u64 ip, + bool in_kernel) +{ + struct call_path_block *cpb; + struct call_path *cp; + size_t n; + + if (cpr->next < cpr->sz) { + cpb = list_last_entry(&cpr->blocks, struct call_path_block, + node); + } else { + cpb = zalloc(sizeof(struct call_path_block)); + if (!cpb) + return NULL; + list_add_tail(&cpb->node, &cpr->blocks); + cpr->sz += CALL_PATH_BLOCK_SIZE; + } + + n = cpr->next++ & CALL_PATH_BLOCK_MASK; + cp = &cpb->cp[n]; + + call_path__init(cp, parent, sym, ip, in_kernel); + + return cp; +} + +struct call_path *call_path__findnew(struct call_path_root *cpr, + struct call_path *parent, + struct symbol *sym, u64 ip, u64 ks) +{ + struct rb_node **p; + struct rb_node *node_parent = NULL; + struct call_path *cp; + bool in_kernel = ip >= ks; + + if (sym) + ip = 0; + + if (!parent) + return call_path__new(cpr, parent, sym, ip, in_kernel); + + p = &parent->children.rb_node; + while (*p != NULL) { + node_parent = *p; + cp = rb_entry(node_parent, struct call_path, rb_node); + + if (cp->sym == sym && cp->ip == ip) + return cp; + + if (sym < cp->sym || (sym == cp->sym && ip < cp->ip)) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + cp = call_path__new(cpr, parent, sym, ip, in_kernel); + if (!cp) + return NULL; + + rb_link_node(&cp->rb_node, node_parent, p); + rb_insert_color(&cp->rb_node, &parent->children); + + return cp; +} diff --git a/tools/perf/util/call-path.h b/tools/perf/util/call-path.h new file mode 100644 index 000000000000..477f6d03b659 --- /dev/null +++ b/tools/perf/util/call-path.h @@ -0,0 +1,77 @@ +/* + * call-path.h: Manipulate a tree data structure containing function call paths + * Copyright (c) 2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ + +#ifndef __PERF_CALL_PATH_H +#define __PERF_CALL_PATH_H + +#include <sys/types.h> + +#include <linux/types.h> +#include <linux/rbtree.h> + +/** + * struct call_path - node in list of calls leading to a function call. + * @parent: call path to the parent function call + * @sym: symbol of function called + * @ip: only if sym is null, the ip of the function + * @db_id: id used for db-export + * @in_kernel: whether function is in the kernel + * @rb_node: node in parent's tree of called functions + * @children: tree of call paths of functions called + * + * In combination with the call_return structure, the call_path structure + * defines a context-sensitive call-graph. 
+ */ +struct call_path { + struct call_path *parent; + struct symbol *sym; + u64 ip; + u64 db_id; + bool in_kernel; + struct rb_node rb_node; + struct rb_root children; +}; + +#define CALL_PATH_BLOCK_SHIFT 8 +#define CALL_PATH_BLOCK_SIZE (1 << CALL_PATH_BLOCK_SHIFT) +#define CALL_PATH_BLOCK_MASK (CALL_PATH_BLOCK_SIZE - 1) + +struct call_path_block { + struct call_path cp[CALL_PATH_BLOCK_SIZE]; + struct list_head node; +}; + +/** + * struct call_path_root - root of all call paths. + * @call_path: root call path + * @blocks: list of blocks to store call paths + * @next: next free space + * @sz: number of spaces + */ +struct call_path_root { + struct call_path call_path; + struct list_head blocks; + size_t next; + size_t sz; +}; + +struct call_path_root *call_path_root__new(void); +void call_path_root__free(struct call_path_root *cpr); + +struct call_path *call_path__findnew(struct call_path_root *cpr, + struct call_path *parent, + struct symbol *sym, u64 ip, u64 ks); + +#endif diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 24b4bd0d7754..07fd30bc2f81 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -109,6 +109,7 @@ __parse_callchain_report_opt(const char *arg, bool allow_record_opt) bool record_opt_set = false; bool try_stack_size = false; + callchain_param.enabled = true; symbol_conf.use_callchain = true; if (!arg) @@ -117,6 +118,7 @@ __parse_callchain_report_opt(const char *arg, bool allow_record_opt) while ((tok = strtok((char *)arg, ",")) != NULL) { if (!strncmp(tok, "none", strlen(tok))) { callchain_param.mode = CHAIN_NONE; + callchain_param.enabled = false; symbol_conf.use_callchain = false; return 0; } @@ -788,7 +790,8 @@ int callchain_cursor_append(struct callchain_cursor *cursor, return 0; } -int sample__resolve_callchain(struct perf_sample *sample, struct symbol **parent, +int sample__resolve_callchain(struct perf_sample *sample, + struct callchain_cursor *cursor, struct symbol **parent, struct perf_evsel *evsel, struct addr_location *al, int max_stack) { @@ -796,8 +799,8 @@ int sample__resolve_callchain(struct perf_sample *sample, struct symbol **parent return 0; if (symbol_conf.use_callchain || symbol_conf.cumulate_callchain || - sort__has_parent) { - return thread__resolve_callchain(al->thread, evsel, sample, + perf_hpp_list.parent) { + return thread__resolve_callchain(al->thread, cursor, evsel, sample, parent, al, max_stack); } return 0; diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index d2a9e694810c..65e2a4f7cb4e 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -212,7 +212,14 @@ struct hist_entry; int record_parse_callchain_opt(const struct option *opt, const char *arg, int unset); int record_callchain_opt(const struct option *opt, const char *arg, int unset); -int sample__resolve_callchain(struct perf_sample *sample, struct symbol **parent, +struct record_opts; + +int record_opts__parse_callchain(struct record_opts *record, + struct callchain_param *callchain, + const char *arg, bool unset); + +int sample__resolve_callchain(struct perf_sample *sample, + struct callchain_cursor *cursor, struct symbol **parent, struct perf_evsel *evsel, struct addr_location *al, int max_stack); int hist_entry__append_callchain(struct hist_entry *he, struct perf_sample *sample); diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c index 4e727635476e..dad7d8272168 100644 --- a/tools/perf/util/config.c +++ b/tools/perf/util/config.c @@ -13,6 +13,7 @@ #include 
<subcmd/exec-cmd.h> #include "util/hist.h" /* perf_hist_config */ #include "util/llvm-utils.h" /* perf_llvm_config */ +#include "config.h" #define MAXNAME (256) @@ -377,6 +378,21 @@ const char *perf_config_dirname(const char *name, const char *value) return value; } +static int perf_buildid_config(const char *var, const char *value) +{ + /* same dir for all commands */ + if (!strcmp(var, "buildid.dir")) { + const char *dir = perf_config_dirname(var, value); + + if (!dir) + return -1; + strncpy(buildid_dir, dir, MAXPATHLEN-1); + buildid_dir[MAXPATHLEN-1] = '\0'; + } + + return 0; +} + static int perf_default_core_config(const char *var __maybe_unused, const char *value __maybe_unused) { @@ -412,6 +428,9 @@ int perf_default_config(const char *var, const char *value, if (!prefixcmp(var, "llvm.")) return perf_llvm_config(var, value); + if (!prefixcmp(var, "buildid.")) + return perf_buildid_config(var, value); + /* Add other config variables here. */ return 0; } @@ -506,41 +525,185 @@ out: return ret; } -/* - * Call this to report error for your variable that should not - * get a boolean value (i.e. "[my] var" means "true"). - */ -int config_error_nonbool(const char *var) +static struct perf_config_section *find_section(struct list_head *sections, + const char *section_name) { - return error("Missing value for '%s'", var); + struct perf_config_section *section; + + list_for_each_entry(section, sections, node) + if (!strcmp(section->name, section_name)) + return section; + + return NULL; +} + +static struct perf_config_item *find_config_item(const char *name, + struct perf_config_section *section) +{ + struct perf_config_item *item; + + list_for_each_entry(item, §ion->items, node) + if (!strcmp(item->name, name)) + return item; + + return NULL; } -struct buildid_dir_config { - char *dir; -}; +static struct perf_config_section *add_section(struct list_head *sections, + const char *section_name) +{ + struct perf_config_section *section = zalloc(sizeof(*section)); + + if (!section) + return NULL; + + INIT_LIST_HEAD(§ion->items); + section->name = strdup(section_name); + if (!section->name) { + pr_debug("%s: strdup failed\n", __func__); + free(section); + return NULL; + } + + list_add_tail(§ion->node, sections); + return section; +} -static int buildid_dir_command_config(const char *var, const char *value, - void *data) +static struct perf_config_item *add_config_item(struct perf_config_section *section, + const char *name) { - struct buildid_dir_config *c = data; - const char *v; + struct perf_config_item *item = zalloc(sizeof(*item)); - /* same dir for all commands */ - if (!strcmp(var, "buildid.dir")) { - v = perf_config_dirname(var, value); - if (!v) - return -1; - strncpy(c->dir, v, MAXPATHLEN-1); - c->dir[MAXPATHLEN-1] = '\0'; + if (!item) + return NULL; + + item->name = strdup(name); + if (!item->name) { + pr_debug("%s: strdup failed\n", __func__); + free(item); + return NULL; } + + list_add_tail(&item->node, §ion->items); + return item; +} + +static int set_value(struct perf_config_item *item, const char *value) +{ + char *val = strdup(value); + + if (!val) + return -1; + + zfree(&item->value); + item->value = val; return 0; } -static void check_buildid_dir_config(void) +static int collect_config(const char *var, const char *value, + void *perf_config_set) { - struct buildid_dir_config c; - c.dir = buildid_dir; - perf_config(buildid_dir_command_config, &c); + int ret = -1; + char *ptr, *key; + char *section_name, *name; + struct perf_config_section *section = NULL; + struct perf_config_item 
*item = NULL; + struct perf_config_set *set = perf_config_set; + struct list_head *sections = &set->sections; + + key = ptr = strdup(var); + if (!key) { + pr_debug("%s: strdup failed\n", __func__); + return -1; + } + + section_name = strsep(&ptr, "."); + name = ptr; + if (name == NULL || value == NULL) + goto out_free; + + section = find_section(sections, section_name); + if (!section) { + section = add_section(sections, section_name); + if (!section) + goto out_free; + } + + item = find_config_item(name, section); + if (!item) { + item = add_config_item(section, name); + if (!item) + goto out_free; + } + + ret = set_value(item, value); + return ret; + +out_free: + free(key); + perf_config_set__delete(set); + return -1; +} + +struct perf_config_set *perf_config_set__new(void) +{ + struct perf_config_set *set = zalloc(sizeof(*set)); + + if (set) { + INIT_LIST_HEAD(&set->sections); + perf_config(collect_config, set); + } + + return set; +} + +static void perf_config_item__delete(struct perf_config_item *item) +{ + zfree(&item->name); + zfree(&item->value); + free(item); +} + +static void perf_config_section__purge(struct perf_config_section *section) +{ + struct perf_config_item *item, *tmp; + + list_for_each_entry_safe(item, tmp, §ion->items, node) { + list_del_init(&item->node); + perf_config_item__delete(item); + } +} + +static void perf_config_section__delete(struct perf_config_section *section) +{ + perf_config_section__purge(section); + zfree(§ion->name); + free(section); +} + +static void perf_config_set__purge(struct perf_config_set *set) +{ + struct perf_config_section *section, *tmp; + + list_for_each_entry_safe(section, tmp, &set->sections, node) { + list_del_init(§ion->node); + perf_config_section__delete(section); + } +} + +void perf_config_set__delete(struct perf_config_set *set) +{ + perf_config_set__purge(set); + free(set); +} + +/* + * Call this to report error for your variable that should not + * get a boolean value (i.e. "[my] var" means "true"). 
+ */ +int config_error_nonbool(const char *var) +{ + return error("Missing value for '%s'", var); } void set_buildid_dir(const char *dir) @@ -548,16 +711,13 @@ void set_buildid_dir(const char *dir) if (dir) scnprintf(buildid_dir, MAXPATHLEN-1, "%s", dir); - /* try config file */ - if (buildid_dir[0] == '\0') - check_buildid_dir_config(); - /* default to $HOME/.debug */ if (buildid_dir[0] == '\0') { - char *v = getenv("HOME"); - if (v) { + char *home = getenv("HOME"); + + if (home) { snprintf(buildid_dir, MAXPATHLEN-1, "%s/%s", - v, DEBUG_CACHE_DIR); + home, DEBUG_CACHE_DIR); } else { strncpy(buildid_dir, DEBUG_CACHE_DIR, MAXPATHLEN-1); } diff --git a/tools/perf/util/config.h b/tools/perf/util/config.h new file mode 100644 index 000000000000..22ec626ac718 --- /dev/null +++ b/tools/perf/util/config.h @@ -0,0 +1,26 @@ +#ifndef __PERF_CONFIG_H +#define __PERF_CONFIG_H + +#include <stdbool.h> +#include <linux/list.h> + +struct perf_config_item { + char *name; + char *value; + struct list_head node; +}; + +struct perf_config_section { + char *name; + struct list_head items; + struct list_head node; +}; + +struct perf_config_set { + struct list_head sections; +}; + +struct perf_config_set *perf_config_set__new(void); +void perf_config_set__delete(struct perf_config_set *set); + +#endif /* __PERF_CONFIG_H */ diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index 9bcf2bed3a6d..02d801670f30 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -587,3 +587,15 @@ int cpu__setup_cpunode_map(void) closedir(dir1); return 0; } + +bool cpu_map__has(struct cpu_map *cpus, int cpu) +{ + int i; + + for (i = 0; i < cpus->nr; ++i) { + if (cpus->map[i] == cpu) + return true; + } + + return false; +} diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index 81a2562aaa2b..1a0a35073ce1 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -66,4 +66,6 @@ int cpu__get_node(int cpu); int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res, int (*f)(struct cpu_map *map, int cpu, void *data), void *data); + +bool cpu_map__has(struct cpu_map *cpus, int cpu); #endif /* __PERF_CPUMAP_H */ diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c index 1921942fc2e0..be83516155ee 100644 --- a/tools/perf/util/data.c +++ b/tools/perf/util/data.c @@ -136,3 +136,44 @@ ssize_t perf_data_file__write(struct perf_data_file *file, { return writen(file->fd, buf, size); } + +int perf_data_file__switch(struct perf_data_file *file, + const char *postfix, + size_t pos, bool at_exit) +{ + char *new_filepath; + int ret; + + if (check_pipe(file)) + return -EINVAL; + if (perf_data_file__is_read(file)) + return -EINVAL; + + if (asprintf(&new_filepath, "%s.%s", file->path, postfix) < 0) + return -ENOMEM; + + /* + * Only fire a warning, don't return error, continue fill + * original file. 
+ */ + if (rename(file->path, new_filepath)) + pr_warning("Failed to rename %s to %s\n", file->path, new_filepath); + + if (!at_exit) { + close(file->fd); + ret = perf_data_file__open(file); + if (ret < 0) + goto out; + + if (lseek(file->fd, pos, SEEK_SET) == (off_t)-1) { + ret = -errno; + pr_debug("Failed to lseek to %zu: %s", + pos, strerror(errno)); + goto out; + } + } + ret = file->fd; +out: + free(new_filepath); + return ret; +} diff --git a/tools/perf/util/data.h b/tools/perf/util/data.h index 2b15d0c95c7f..ae510ce16cb1 100644 --- a/tools/perf/util/data.h +++ b/tools/perf/util/data.h @@ -46,5 +46,14 @@ int perf_data_file__open(struct perf_data_file *file); void perf_data_file__close(struct perf_data_file *file); ssize_t perf_data_file__write(struct perf_data_file *file, void *buf, size_t size); - +/* + * If at_exit is set, only rename current perf.data to + * perf.data.<postfix>, continue write on original file. + * Set at_exit when flushing the last output. + * + * Return value is fd of new output. + */ +int perf_data_file__switch(struct perf_data_file *file, + const char *postfix, + size_t pos, bool at_exit); #endif /* __PERF_DATA_H */ diff --git a/tools/perf/util/db-export.c b/tools/perf/util/db-export.c index 049438d51b9a..8d96c80cc67e 100644 --- a/tools/perf/util/db-export.c +++ b/tools/perf/util/db-export.c @@ -23,6 +23,8 @@ #include "event.h" #include "util.h" #include "thread-stack.h" +#include "callchain.h" +#include "call-path.h" #include "db-export.h" struct deferred_export { @@ -258,8 +260,7 @@ static int db_ids_from_al(struct db_export *dbe, struct addr_location *al, if (!al->sym) { al->sym = symbol__new(al->addr, 0, 0, "unknown"); if (al->sym) - symbols__insert(&dso->symbols[al->map->type], - al->sym); + dso__insert_symbol(dso, al->map->type, al->sym); } if (al->sym) { @@ -276,6 +277,80 @@ static int db_ids_from_al(struct db_export *dbe, struct addr_location *al, return 0; } +static struct call_path *call_path_from_sample(struct db_export *dbe, + struct machine *machine, + struct thread *thread, + struct perf_sample *sample, + struct perf_evsel *evsel) +{ + u64 kernel_start = machine__kernel_start(machine); + struct call_path *current = &dbe->cpr->call_path; + enum chain_order saved_order = callchain_param.order; + int err; + + if (!symbol_conf.use_callchain || !sample->callchain) + return NULL; + + /* + * Since the call path tree must be built starting with the root, we + * must use ORDER_CALL for call chain resolution, in order to process + * the callchain starting with the root node and ending with the leaf. + */ + callchain_param.order = ORDER_CALLER; + err = thread__resolve_callchain(thread, &callchain_cursor, evsel, + sample, NULL, NULL, + sysctl_perf_event_max_stack); + if (err) { + callchain_param.order = saved_order; + return NULL; + } + callchain_cursor_commit(&callchain_cursor); + + while (1) { + struct callchain_cursor_node *node; + struct addr_location al; + u64 dso_db_id = 0, sym_db_id = 0, offset = 0; + + memset(&al, 0, sizeof(al)); + + node = callchain_cursor_current(&callchain_cursor); + if (!node) + break; + /* + * Handle export of symbol and dso for this node by + * constructing an addr_location struct and then passing it to + * db_ids_from_al() to perform the export. 
+ */ + al.sym = node->sym; + al.map = node->map; + al.machine = machine; + al.addr = node->ip; + + if (al.map && !al.sym) + al.sym = dso__find_symbol(al.map->dso, MAP__FUNCTION, + al.addr); + + db_ids_from_al(dbe, &al, &dso_db_id, &sym_db_id, &offset); + + /* add node to the call path tree if it doesn't exist */ + current = call_path__findnew(dbe->cpr, current, + al.sym, node->ip, + kernel_start); + + callchain_cursor_advance(&callchain_cursor); + } + + /* Reset the callchain order to its prior value. */ + callchain_param.order = saved_order; + + if (current == &dbe->cpr->call_path) { + /* Bail because the callchain was empty. */ + return NULL; + } + + return current; +} + int db_export__branch_type(struct db_export *dbe, u32 branch_type, const char *name) { @@ -329,6 +404,16 @@ int db_export__sample(struct db_export *dbe, union perf_event *event, if (err) goto out_put; + if (dbe->cpr) { + struct call_path *cp = call_path_from_sample(dbe, al->machine, + thread, sample, + evsel); + if (cp) { + db_export__call_path(dbe, cp); + es.call_path_id = cp->db_id; + } + } + if ((evsel->attr.sample_type & PERF_SAMPLE_ADDR) && sample_addr_correlates_sym(&evsel->attr)) { struct addr_location addr_al; diff --git a/tools/perf/util/db-export.h b/tools/perf/util/db-export.h index 25e22fd76aca..67bc6b8ad2d6 100644 --- a/tools/perf/util/db-export.h +++ b/tools/perf/util/db-export.h @@ -27,6 +27,7 @@ struct dso; struct perf_sample; struct addr_location; struct call_return_processor; +struct call_path_root; struct call_path; struct call_return; @@ -43,6 +44,7 @@ struct export_sample { u64 addr_dso_db_id; u64 addr_sym_db_id; u64 addr_offset; /* addr offset from symbol start */ + u64 call_path_id; }; struct db_export { @@ -64,6 +66,7 @@ struct db_export { int (*export_call_return)(struct db_export *dbe, struct call_return *cr); struct call_return_processor *crp; + struct call_path_root *cpr; u64 evsel_last_db_id; u64 machine_last_db_id; u64 thread_last_db_id; diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index 8e6395439ca0..3357479082ca 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -38,7 +38,7 @@ int dso__read_binary_type_filename(const struct dso *dso, enum dso_binary_type type, char *root_dir, char *filename, size_t size) { - char build_id_hex[BUILD_ID_SIZE * 2 + 1]; + char build_id_hex[SBUILD_ID_SIZE]; int ret = 0; size_t len; @@ -1301,7 +1301,7 @@ size_t __dsos__fprintf(struct list_head *head, FILE *fp) size_t dso__fprintf_buildid(struct dso *dso, FILE *fp) { - char sbuild_id[BUILD_ID_SIZE * 2 + 1]; + char sbuild_id[SBUILD_ID_SIZE]; build_id__sprintf(dso->build_id, sizeof(dso->build_id), sbuild_id); return fprintf(fp, "%s", sbuild_id); diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c index 577e600c8eb1..a347b19c961a 100644 --- a/tools/perf/util/dwarf-aux.c +++ b/tools/perf/util/dwarf-aux.c @@ -915,8 +915,7 @@ int die_get_typename(Dwarf_Die *vr_die, struct strbuf *buf) tmp = "*"; else if (tag == DW_TAG_subroutine_type) { /* Function pointer */ - strbuf_add(buf, "(function_type)", 15); - return 0; + return strbuf_add(buf, "(function_type)", 15); } else { if (!dwarf_diename(&type)) return -ENOENT; @@ -927,14 +926,10 @@ int die_get_typename(Dwarf_Die *vr_die, struct strbuf *buf) else if (tag == DW_TAG_enumeration_type) tmp = "enum "; /* Write a base name */ - strbuf_addf(buf, "%s%s", tmp, dwarf_diename(&type)); - return 0; + return strbuf_addf(buf, "%s%s", tmp, dwarf_diename(&type)); } ret = die_get_typename(&type, buf); - if (ret == 0) - strbuf_addstr(buf, 
tmp); - - return ret; + return ret ? ret : strbuf_addstr(buf, tmp); } /** @@ -951,14 +946,13 @@ int die_get_varname(Dwarf_Die *vr_die, struct strbuf *buf) ret = die_get_typename(vr_die, buf); if (ret < 0) { pr_debug("Failed to get type, make it unknown.\n"); - strbuf_add(buf, " (unknown_type)", 14); + ret = strbuf_add(buf, " (unknown_type)", 14); } - strbuf_addf(buf, "\t%s", dwarf_diename(vr_die)); - - return 0; + return ret < 0 ? ret : strbuf_addf(buf, "\t%s", dwarf_diename(vr_die)); } +#ifdef HAVE_DWARF_GETLOCATIONS /** * die_get_var_innermost_scope - Get innermost scope range of given variable DIE * @sp_die: a subprogram DIE @@ -998,22 +992,24 @@ static int die_get_var_innermost_scope(Dwarf_Die *sp_die, Dwarf_Die *vr_die, } while ((offset = dwarf_ranges(&scopes[1], offset, &base, - &start, &end)) > 0) { + &start, &end)) > 0) { start -= entry; end -= entry; if (first) { - strbuf_addf(buf, "@<%s+[%" PRIu64 "-%" PRIu64, - name, start, end); + ret = strbuf_addf(buf, "@<%s+[%" PRIu64 "-%" PRIu64, + name, start, end); first = false; } else { - strbuf_addf(buf, ",%" PRIu64 "-%" PRIu64, - start, end); + ret = strbuf_addf(buf, ",%" PRIu64 "-%" PRIu64, + start, end); } + if (ret < 0) + goto out; } if (!first) - strbuf_add(buf, "]>", 2); + ret = strbuf_add(buf, "]>", 2); out: free(scopes); @@ -1053,30 +1049,39 @@ int die_get_var_range(Dwarf_Die *sp_die, Dwarf_Die *vr_die, struct strbuf *buf) if (dwarf_attr(vr_die, DW_AT_location, &attr) == NULL) return -EINVAL; - while ((offset = dwarf_getlocations( - &attr, offset, &base, - &start, &end, &op, &nops)) > 0) { + while ((offset = dwarf_getlocations(&attr, offset, &base, + &start, &end, &op, &nops)) > 0) { if (start == 0) { /* Single Location Descriptions */ ret = die_get_var_innermost_scope(sp_die, vr_die, buf); - return ret; + goto out; } /* Location Lists */ start -= entry; end -= entry; if (first) { - strbuf_addf(buf, "@<%s+[%" PRIu64 "-%" PRIu64, - name, start, end); + ret = strbuf_addf(buf, "@<%s+[%" PRIu64 "-%" PRIu64, + name, start, end); first = false; } else { - strbuf_addf(buf, ",%" PRIu64 "-%" PRIu64, - start, end); + ret = strbuf_addf(buf, ",%" PRIu64 "-%" PRIu64, + start, end); } + if (ret < 0) + goto out; } if (!first) - strbuf_add(buf, "]>", 2); - + ret = strbuf_add(buf, "]>", 2); +out: return ret; } +#else +int die_get_var_range(Dwarf_Die *sp_die __maybe_unused, + Dwarf_Die *vr_die __maybe_unused, + struct strbuf *buf __maybe_unused) +{ + return -ENOTSUP; +} +#endif diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index dad55d04ffdd..f6fcc6832949 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -45,6 +45,7 @@ static const char *perf_event__names[] = { [PERF_RECORD_STAT] = "STAT", [PERF_RECORD_STAT_ROUND] = "STAT_ROUND", [PERF_RECORD_EVENT_UPDATE] = "EVENT_UPDATE", + [PERF_RECORD_TIME_CONV] = "TIME_CONV", }; const char *perf_event__name(unsigned int id) @@ -433,7 +434,7 @@ static int __event__synthesize_thread(union perf_event *comm_event, { char filename[PATH_MAX]; DIR *tasks; - struct dirent dirent, *next; + struct dirent *dirent; pid_t tgid, ppid; int rc = 0; @@ -462,11 +463,11 @@ static int __event__synthesize_thread(union perf_event *comm_event, return 0; } - while (!readdir_r(tasks, &dirent, &next) && next) { + while ((dirent = readdir(tasks)) != NULL) { char *end; pid_t _pid; - _pid = strtol(dirent.d_name, &end, 10); + _pid = strtol(dirent->d_name, &end, 10); if (*end) continue; @@ -575,7 +576,7 @@ int perf_event__synthesize_threads(struct perf_tool *tool, { DIR *proc; char proc_path[PATH_MAX]; 
- struct dirent dirent, *next; + struct dirent *dirent; union perf_event *comm_event, *mmap_event, *fork_event; int err = -1; @@ -600,9 +601,9 @@ int perf_event__synthesize_threads(struct perf_tool *tool, if (proc == NULL) goto out_free_fork; - while (!readdir_r(proc, &dirent, &next) && next) { + while ((dirent = readdir(proc)) != NULL) { char *end; - pid_t pid = strtol(dirent.d_name, &end, 10); + pid_t pid = strtol(dirent->d_name, &end, 10); if (*end) /* only interested in proper numerical dirents */ continue; diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 6bb1c928350d..8d363d5e65a2 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -233,6 +233,7 @@ enum perf_user_event_type { /* above any possible kernel type */ PERF_RECORD_STAT = 76, PERF_RECORD_STAT_ROUND = 77, PERF_RECORD_EVENT_UPDATE = 78, + PERF_RECORD_TIME_CONV = 79, PERF_RECORD_HEADER_MAX }; @@ -469,6 +470,13 @@ struct stat_round_event { u64 time; }; +struct time_conv_event { + struct perf_event_header header; + u64 time_shift; + u64 time_mult; + u64 time_zero; +}; + union perf_event { struct perf_event_header header; struct mmap_event mmap; @@ -497,6 +505,7 @@ union perf_event { struct stat_config_event stat_config; struct stat_event stat; struct stat_round_event stat_round; + struct time_conv_event time_conv; }; void perf_event__print_totals(void); diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 86a03836a83f..c4bfe11479a0 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -679,53 +679,52 @@ static struct perf_evsel *perf_evlist__event2evsel(struct perf_evlist *evlist, return NULL; } -union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx) +/* When check_messup is true, 'end' must points to a good entry */ +static union perf_event * +perf_mmap__read(struct perf_mmap *md, bool check_messup, u64 start, + u64 end, u64 *prev) { - struct perf_mmap *md = &evlist->mmap[idx]; - u64 head; - u64 old = md->prev; unsigned char *data = md->base + page_size; union perf_event *event = NULL; + int diff = end - start; - /* - * Check if event was unmapped due to a POLLHUP/POLLERR. - */ - if (!atomic_read(&md->refcnt)) - return NULL; - - head = perf_mmap__read_head(md); - if (evlist->overwrite) { + if (check_messup) { /* * If we're further behind than half the buffer, there's a chance * the writer will bite our tail and mess up the samples under us. * - * If we somehow ended up ahead of the head, we got messed up. + * If we somehow ended up ahead of the 'end', we got messed up. * - * In either case, truncate and restart at head. + * In either case, truncate and restart at 'end'. */ - int diff = head - old; if (diff > md->mask / 2 || diff < 0) { fprintf(stderr, "WARNING: failed to keep up with mmap data.\n"); /* - * head points to a known good entry, start there. + * 'end' points to a known good entry, start there. */ - old = head; + start = end; + diff = 0; } } - if (old != head) { + if (diff >= (int)sizeof(event->header)) { size_t size; - event = (union perf_event *)&data[old & md->mask]; + event = (union perf_event *)&data[start & md->mask]; size = event->header.size; + if (size < sizeof(event->header) || diff < (int)size) { + event = NULL; + goto broken_event; + } + /* * Event straddles the mmap boundary -- header should always * be inside due to u64 alignment of output. 
*/ - if ((old & md->mask) + size != ((old + size) & md->mask)) { - unsigned int offset = old; + if ((start & md->mask) + size != ((start + size) & md->mask)) { + unsigned int offset = start; unsigned int len = min(sizeof(*event), size), cpy; void *dst = md->event_copy; @@ -740,14 +739,83 @@ union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx) event = (union perf_event *) md->event_copy; } - old += size; + start += size; } - md->prev = old; +broken_event: + if (prev) + *prev = start; return event; } +union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx) +{ + struct perf_mmap *md = &evlist->mmap[idx]; + u64 head; + u64 old = md->prev; + + /* + * Check if event was unmapped due to a POLLHUP/POLLERR. + */ + if (!atomic_read(&md->refcnt)) + return NULL; + + head = perf_mmap__read_head(md); + + return perf_mmap__read(md, evlist->overwrite, old, head, &md->prev); +} + +union perf_event * +perf_evlist__mmap_read_backward(struct perf_evlist *evlist, int idx) +{ + struct perf_mmap *md = &evlist->mmap[idx]; + u64 head, end; + u64 start = md->prev; + + /* + * Check if event was unmapped due to a POLLHUP/POLLERR. + */ + if (!atomic_read(&md->refcnt)) + return NULL; + + head = perf_mmap__read_head(md); + if (!head) + return NULL; + + /* + * 'head' pointer starts from 0. Kernel minus sizeof(record) form + * it each time when kernel writes to it, so in fact 'head' is + * negative. 'end' pointer is made manually by adding the size of + * the ring buffer to 'head' pointer, means the validate data can + * read is the whole ring buffer. If 'end' is positive, the ring + * buffer has not fully filled, so we must adjust 'end' to 0. + * + * However, since both 'head' and 'end' is unsigned, we can't + * simply compare 'end' against 0. Here we compare '-head' and + * the size of the ring buffer, where -head is the number of bytes + * kernel write to the ring buffer. + */ + if (-head < (u64)(md->mask + 1)) + end = 0; + else + end = head + md->mask + 1; + + return perf_mmap__read(md, false, start, end, &md->prev); +} + +void perf_evlist__mmap_read_catchup(struct perf_evlist *evlist, int idx) +{ + struct perf_mmap *md = &evlist->mmap[idx]; + u64 head; + + if (!atomic_read(&md->refcnt)) + return; + + head = perf_mmap__read_head(md); + md->prev = head; +} + static bool perf_mmap__empty(struct perf_mmap *md) { return perf_mmap__read_head(md) == md->prev && !md->auxtrace_mmap.base; @@ -986,26 +1054,34 @@ out_unmap: return -1; } -static size_t perf_evlist__mmap_size(unsigned long pages) +unsigned long perf_event_mlock_kb_in_pages(void) { - if (pages == UINT_MAX) { - int max; + unsigned long pages; + int max; - if (sysctl__read_int("kernel/perf_event_mlock_kb", &max) < 0) { - /* - * Pick a once upon a time good value, i.e. things look - * strange since we can't read a sysctl value, but lets not - * die yet... - */ - max = 512; - } else { - max -= (page_size / 1024); - } + if (sysctl__read_int("kernel/perf_event_mlock_kb", &max) < 0) { + /* + * Pick a once upon a time good value, i.e. things look + * strange since we can't read a sysctl value, but lets not + * die yet... 
+ */ + max = 512; + } else { + max -= (page_size / 1024); + } - pages = (max * 1024) / page_size; - if (!is_power_of_2(pages)) - pages = rounddown_pow_of_two(pages); - } else if (!is_power_of_2(pages)) + pages = (max * 1024) / page_size; + if (!is_power_of_2(pages)) + pages = rounddown_pow_of_two(pages); + + return pages; +} + +static size_t perf_evlist__mmap_size(unsigned long pages) +{ + if (pages == UINT_MAX) + pages = perf_event_mlock_kb_in_pages(); + else if (!is_power_of_2(pages)) return 0; return (pages + 1) * page_size; @@ -1192,6 +1268,24 @@ void perf_evlist__set_maps(struct perf_evlist *evlist, struct cpu_map *cpus, perf_evlist__propagate_maps(evlist); } +void __perf_evlist__set_sample_bit(struct perf_evlist *evlist, + enum perf_event_sample_format bit) +{ + struct perf_evsel *evsel; + + evlist__for_each(evlist, evsel) + __perf_evsel__set_sample_bit(evsel, bit); +} + +void __perf_evlist__reset_sample_bit(struct perf_evlist *evlist, + enum perf_event_sample_format bit) +{ + struct perf_evsel *evsel; + + evlist__for_each(evlist, evsel) + __perf_evsel__reset_sample_bit(evsel, bit); +} + int perf_evlist__apply_filters(struct perf_evlist *evlist, struct perf_evsel **err_evsel) { struct perf_evsel *evsel; diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index a0d15221db6e..85d1b59802e8 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -87,6 +87,17 @@ int perf_evlist__add_dummy(struct perf_evlist *evlist); int perf_evlist__add_newtp(struct perf_evlist *evlist, const char *sys, const char *name, void *handler); +void __perf_evlist__set_sample_bit(struct perf_evlist *evlist, + enum perf_event_sample_format bit); +void __perf_evlist__reset_sample_bit(struct perf_evlist *evlist, + enum perf_event_sample_format bit); + +#define perf_evlist__set_sample_bit(evlist, bit) \ + __perf_evlist__set_sample_bit(evlist, PERF_SAMPLE_##bit) + +#define perf_evlist__reset_sample_bit(evlist, bit) \ + __perf_evlist__reset_sample_bit(evlist, PERF_SAMPLE_##bit) + int perf_evlist__set_filter(struct perf_evlist *evlist, const char *filter); int perf_evlist__set_filter_pid(struct perf_evlist *evlist, pid_t pid); int perf_evlist__set_filter_pids(struct perf_evlist *evlist, size_t npids, pid_t *pids); @@ -118,16 +129,23 @@ struct perf_sample_id *perf_evlist__id2sid(struct perf_evlist *evlist, u64 id); union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx); +union perf_event *perf_evlist__mmap_read_backward(struct perf_evlist *evlist, + int idx); +void perf_evlist__mmap_read_catchup(struct perf_evlist *evlist, int idx); + void perf_evlist__mmap_consume(struct perf_evlist *evlist, int idx); int perf_evlist__open(struct perf_evlist *evlist); void perf_evlist__close(struct perf_evlist *evlist); +struct callchain_param; + void perf_evlist__set_id_pos(struct perf_evlist *evlist); bool perf_can_sample_identifier(void); bool perf_can_record_switch_events(void); bool perf_can_record_cpu_wide(void); -void perf_evlist__config(struct perf_evlist *evlist, struct record_opts *opts); +void perf_evlist__config(struct perf_evlist *evlist, struct record_opts *opts, + struct callchain_param *callchain); int record_opts__config(struct record_opts *opts); int perf_evlist__prepare_workload(struct perf_evlist *evlist, @@ -144,6 +162,8 @@ int perf_evlist__parse_mmap_pages(const struct option *opt, const char *str, int unset); +unsigned long perf_event_mlock_kb_in_pages(void); + int perf_evlist__mmap_ex(struct perf_evlist *evlist, unsigned int pages, bool overwrite, unsigned int 
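The evlist-wide sample-bit helpers added above take only the PERF_SAMPLE_* suffix and paste the prefix in the macro, mirroring the existing per-evsel __perf_evsel__set_sample_bit() helpers. A usage sketch with a hypothetical evlist variable:

    /* Equivalent to __perf_evlist__set_sample_bit(evlist, PERF_SAMPLE_TIME) */
    perf_evlist__set_sample_bit(evlist, TIME);

    /* Equivalent to __perf_evlist__reset_sample_bit(evlist, PERF_SAMPLE_BRANCH_STACK) */
    perf_evlist__reset_sample_bit(evlist, BRANCH_STACK);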
auxtrace_pages, bool auxtrace_overwrite); diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 738ce226002b..964c7c3602c0 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -226,7 +226,8 @@ struct perf_evsel *perf_evsel__new_idx(struct perf_event_attr *attr, int idx) perf_evsel__init(evsel, attr, idx); if (perf_evsel__is_bpf_output(evsel)) { - evsel->attr.sample_type |= PERF_SAMPLE_RAW; + evsel->attr.sample_type |= (PERF_SAMPLE_RAW | PERF_SAMPLE_TIME | + PERF_SAMPLE_CPU | PERF_SAMPLE_PERIOD), evsel->attr.sample_period = 1; } @@ -561,10 +562,9 @@ int perf_evsel__group_desc(struct perf_evsel *evsel, char *buf, size_t size) return ret; } -static void -perf_evsel__config_callgraph(struct perf_evsel *evsel, - struct record_opts *opts, - struct callchain_param *param) +void perf_evsel__config_callchain(struct perf_evsel *evsel, + struct record_opts *opts, + struct callchain_param *param) { bool function = perf_evsel__is_function_event(evsel); struct perf_event_attr *attr = &evsel->attr; @@ -704,7 +704,7 @@ static void apply_config_terms(struct perf_evsel *evsel, /* set perf-event callgraph */ if (param.enabled) - perf_evsel__config_callgraph(evsel, opts, ¶m); + perf_evsel__config_callchain(evsel, opts, ¶m); } } @@ -736,7 +736,8 @@ static void apply_config_terms(struct perf_evsel *evsel, * enable/disable events specifically, as there's no * initial traced exec call. */ -void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts) +void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts, + struct callchain_param *callchain) { struct perf_evsel *leader = evsel->leader; struct perf_event_attr *attr = &evsel->attr; @@ -811,8 +812,8 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts) if (perf_evsel__is_function_event(evsel)) evsel->attr.exclude_callchain_user = 1; - if (callchain_param.enabled && !evsel->no_aux_samples) - perf_evsel__config_callgraph(evsel, opts, &callchain_param); + if (callchain && callchain->enabled && !evsel->no_aux_samples) + perf_evsel__config_callchain(evsel, opts, callchain); if (opts->sample_intr_regs) { attr->sample_regs_intr = opts->sample_intr_regs; @@ -1230,6 +1231,21 @@ static void __p_sample_type(char *buf, size_t size, u64 value) __p_bits(buf, size, value, bits); } +static void __p_branch_sample_type(char *buf, size_t size, u64 value) +{ +#define bit_name(n) { PERF_SAMPLE_BRANCH_##n, #n } + struct bit_names bits[] = { + bit_name(USER), bit_name(KERNEL), bit_name(HV), bit_name(ANY), + bit_name(ANY_CALL), bit_name(ANY_RETURN), bit_name(IND_CALL), + bit_name(ABORT_TX), bit_name(IN_TX), bit_name(NO_TX), + bit_name(COND), bit_name(CALL_STACK), bit_name(IND_JUMP), + bit_name(CALL), bit_name(NO_FLAGS), bit_name(NO_CYCLES), + { .name = NULL, } + }; +#undef bit_name + __p_bits(buf, size, value, bits); +} + static void __p_read_format(char *buf, size_t size, u64 value) { #define bit_name(n) { PERF_FORMAT_##n, #n } @@ -1248,6 +1264,7 @@ static void __p_read_format(char *buf, size_t size, u64 value) #define p_unsigned(val) snprintf(buf, BUF_SIZE, "%"PRIu64, (uint64_t)(val)) #define p_signed(val) snprintf(buf, BUF_SIZE, "%"PRId64, (int64_t)(val)) #define p_sample_type(val) __p_sample_type(buf, BUF_SIZE, val) +#define p_branch_sample_type(val) __p_branch_sample_type(buf, BUF_SIZE, val) #define p_read_format(val) __p_read_format(buf, BUF_SIZE, val) #define PRINT_ATTRn(_n, _f, _p) \ @@ -1299,12 +1316,13 @@ int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr, 
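__p_branch_sample_type() above follows the same table-driven decode as the existing __p_sample_type()/__p_read_format() helpers: a {bit, name} array walked once, with matching names joined by '|'. A self-contained sketch of that pattern; the table, buffer handling and function names here are illustrative, not perf's own:

    #include <stdio.h>
    #include <stdint.h>
    #include <stddef.h>

    struct bit_name { uint64_t bit; const char *name; };

    static void decode_bits(char *buf, size_t size, uint64_t value,
                            const struct bit_name *bits)
    {
            size_t used = 0;
            int i;

            buf[0] = '\0';
            for (i = 0; bits[i].name && used < size; i++) {
                    if (!(value & bits[i].bit))
                            continue;
                    /* Join already-printed names with '|'. */
                    used += snprintf(buf + used, size - used, "%s%s",
                                     used ? "|" : "", bits[i].name);
            }
    }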
PRINT_ATTRf(comm_exec, p_unsigned); PRINT_ATTRf(use_clockid, p_unsigned); PRINT_ATTRf(context_switch, p_unsigned); + PRINT_ATTRf(write_backward, p_unsigned); PRINT_ATTRn("{ wakeup_events, wakeup_watermark }", wakeup_events, p_unsigned); PRINT_ATTRf(bp_type, p_unsigned); PRINT_ATTRn("{ bp_addr, config1 }", bp_addr, p_hex); PRINT_ATTRn("{ bp_len, config2 }", bp_len, p_hex); - PRINT_ATTRf(branch_sample_type, p_unsigned); + PRINT_ATTRf(branch_sample_type, p_branch_sample_type); PRINT_ATTRf(sample_regs_user, p_hex); PRINT_ATTRf(sample_stack_user, p_unsigned); PRINT_ATTRf(clockid, p_signed); @@ -2253,98 +2271,11 @@ u64 perf_evsel__intval(struct perf_evsel *evsel, struct perf_sample *sample, return 0; } -static int comma_fprintf(FILE *fp, bool *first, const char *fmt, ...) -{ - va_list args; - int ret = 0; - - if (!*first) { - ret += fprintf(fp, ","); - } else { - ret += fprintf(fp, ":"); - *first = false; - } - - va_start(args, fmt); - ret += vfprintf(fp, fmt, args); - va_end(args); - return ret; -} - -static int __print_attr__fprintf(FILE *fp, const char *name, const char *val, void *priv) -{ - return comma_fprintf(fp, (bool *)priv, " %s: %s", name, val); -} - -int perf_evsel__fprintf(struct perf_evsel *evsel, - struct perf_attr_details *details, FILE *fp) -{ - bool first = true; - int printed = 0; - - if (details->event_group) { - struct perf_evsel *pos; - - if (!perf_evsel__is_group_leader(evsel)) - return 0; - - if (evsel->nr_members > 1) - printed += fprintf(fp, "%s{", evsel->group_name ?: ""); - - printed += fprintf(fp, "%s", perf_evsel__name(evsel)); - for_each_group_member(pos, evsel) - printed += fprintf(fp, ",%s", perf_evsel__name(pos)); - - if (evsel->nr_members > 1) - printed += fprintf(fp, "}"); - goto out; - } - - printed += fprintf(fp, "%s", perf_evsel__name(evsel)); - - if (details->verbose) { - printed += perf_event_attr__fprintf(fp, &evsel->attr, - __print_attr__fprintf, &first); - } else if (details->freq) { - const char *term = "sample_freq"; - - if (!evsel->attr.freq) - term = "sample_period"; - - printed += comma_fprintf(fp, &first, " %s=%" PRIu64, - term, (u64)evsel->attr.sample_freq); - } - - if (details->trace_fields) { - struct format_field *field; - - if (evsel->attr.type != PERF_TYPE_TRACEPOINT) { - printed += comma_fprintf(fp, &first, " (not a tracepoint)"); - goto out; - } - - field = evsel->tp_format->format.fields; - if (field == NULL) { - printed += comma_fprintf(fp, &first, " (no trace field)"); - goto out; - } - - printed += comma_fprintf(fp, &first, " trace_fields: %s", field->name); - - field = field->next; - while (field) { - printed += comma_fprintf(fp, &first, "%s", field->name); - field = field->next; - } - } -out: - fputc('\n', fp); - return ++printed; -} - bool perf_evsel__fallback(struct perf_evsel *evsel, int err, char *msg, size_t msgsize) { + int paranoid; + if ((err == ENOENT || err == ENXIO || err == ENODEV) && evsel->attr.type == PERF_TYPE_HARDWARE && evsel->attr.config == PERF_COUNT_HW_CPU_CYCLES) { @@ -2364,6 +2295,22 @@ bool perf_evsel__fallback(struct perf_evsel *evsel, int err, zfree(&evsel->name); return true; + } else if (err == EACCES && !evsel->attr.exclude_kernel && + (paranoid = perf_event_paranoid()) > 1) { + const char *name = perf_evsel__name(evsel); + char *new_name; + + if (asprintf(&new_name, "%s%su", name, strchr(name, ':') ? 
"" : ":") < 0) + return false; + + if (evsel->name) + free(evsel->name); + evsel->name = new_name; + scnprintf(msg, msgsize, +"kernel.perf_event_paranoid=%d, trying to fall back to excluding kernel samples", paranoid); + evsel->attr.exclude_kernel = 1; + + return true; } return false; @@ -2382,12 +2329,13 @@ int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target, "Consider tweaking /proc/sys/kernel/perf_event_paranoid,\n" "which controls use of the performance events system by\n" "unprivileged users (without CAP_SYS_ADMIN).\n\n" - "The default value is 1:\n\n" + "The current value is %d:\n\n" " -1: Allow use of (almost) all events by all users\n" ">= 0: Disallow raw tracepoint access by users without CAP_IOC_LOCK\n" ">= 1: Disallow CPU event access by users without CAP_SYS_ADMIN\n" ">= 2: Disallow kernel profiling by users without CAP_SYS_ADMIN", - target->system_wide ? "system-wide " : ""); + target->system_wide ? "system-wide " : "", + perf_event_paranoid()); case ENOENT: return scnprintf(msg, size, "The %s event is not supported.", perf_evsel__name(evsel)); @@ -2397,10 +2345,18 @@ int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target, "Probably the maximum number of open file descriptors has been reached.\n" "Hint: Try again after reducing the number of events.\n" "Hint: Try increasing the limit with 'ulimit -n <limit>'"); + case ENOMEM: + if ((evsel->attr.sample_type & PERF_SAMPLE_CALLCHAIN) != 0 && + access("/proc/sys/kernel/perf_event_max_stack", F_OK) == 0) + return scnprintf(msg, size, + "Not enough memory to setup event with callchain.\n" + "Hint: Try tweaking /proc/sys/kernel/perf_event_max_stack\n" + "Hint: Current value: %d", sysctl_perf_event_max_stack); + break; case ENODEV: if (target->cpu_list) return scnprintf(msg, size, "%s", - "No such device - did you specify an out-of-range profile CPU?\n"); + "No such device - did you specify an out-of-range profile CPU?"); break; case EOPNOTSUPP: if (evsel->attr.precise_ip) @@ -2432,7 +2388,7 @@ int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target, return scnprintf(msg, size, "The sys_perf_event_open() syscall returned with %d (%s) for event (%s).\n" "/bin/dmesg may provide additional information.\n" - "No CONFIG_PERF_EVENTS=y kernel support configured?\n", + "No CONFIG_PERF_EVENTS=y kernel support configured?", err, strerror_r(err, sbuf, sizeof(sbuf)), perf_evsel__name(evsel)); } diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 501ea6e565f1..8a644fef452c 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -178,8 +178,14 @@ void perf_evsel__init(struct perf_evsel *evsel, void perf_evsel__exit(struct perf_evsel *evsel); void perf_evsel__delete(struct perf_evsel *evsel); +struct callchain_param; + void perf_evsel__config(struct perf_evsel *evsel, - struct record_opts *opts); + struct record_opts *opts, + struct callchain_param *callchain); +void perf_evsel__config_callchain(struct perf_evsel *evsel, + struct record_opts *opts, + struct callchain_param *callchain); int __perf_evsel__sample_size(u64 sample_type); void perf_evsel__calc_id_pos(struct perf_evsel *evsel); @@ -381,6 +387,24 @@ struct perf_attr_details { int perf_evsel__fprintf(struct perf_evsel *evsel, struct perf_attr_details *details, FILE *fp); +#define EVSEL__PRINT_IP (1<<0) +#define EVSEL__PRINT_SYM (1<<1) +#define EVSEL__PRINT_DSO (1<<2) +#define EVSEL__PRINT_SYMOFFSET (1<<3) +#define EVSEL__PRINT_ONELINE (1<<4) +#define EVSEL__PRINT_SRCLINE (1<<5) +#define 
EVSEL__PRINT_UNKNOWN_AS_ADDR (1<<6) + +struct callchain_cursor; + +int sample__fprintf_callchain(struct perf_sample *sample, int left_alignment, + unsigned int print_opts, + struct callchain_cursor *cursor, FILE *fp); + +int sample__fprintf_sym(struct perf_sample *sample, struct addr_location *al, + int left_alignment, unsigned int print_opts, + struct callchain_cursor *cursor, FILE *fp); + bool perf_evsel__fallback(struct perf_evsel *evsel, int err, char *msg, size_t msgsize); int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target, @@ -396,7 +420,7 @@ for ((_evsel) = list_entry((_leader)->node.next, struct perf_evsel, node); \ (_evsel) && (_evsel)->leader == (_leader); \ (_evsel) = list_entry((_evsel)->node.next, struct perf_evsel, node)) -static inline bool has_branch_callstack(struct perf_evsel *evsel) +static inline bool perf_evsel__has_branch_callstack(const struct perf_evsel *evsel) { return evsel->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK; } diff --git a/tools/perf/util/evsel_fprintf.c b/tools/perf/util/evsel_fprintf.c new file mode 100644 index 000000000000..3674e77ad640 --- /dev/null +++ b/tools/perf/util/evsel_fprintf.c @@ -0,0 +1,212 @@ +#include <stdio.h> +#include <stdbool.h> +#include <traceevent/event-parse.h> +#include "evsel.h" +#include "callchain.h" +#include "map.h" +#include "symbol.h" + +static int comma_fprintf(FILE *fp, bool *first, const char *fmt, ...) +{ + va_list args; + int ret = 0; + + if (!*first) { + ret += fprintf(fp, ","); + } else { + ret += fprintf(fp, ":"); + *first = false; + } + + va_start(args, fmt); + ret += vfprintf(fp, fmt, args); + va_end(args); + return ret; +} + +static int __print_attr__fprintf(FILE *fp, const char *name, const char *val, void *priv) +{ + return comma_fprintf(fp, (bool *)priv, " %s: %s", name, val); +} + +int perf_evsel__fprintf(struct perf_evsel *evsel, + struct perf_attr_details *details, FILE *fp) +{ + bool first = true; + int printed = 0; + + if (details->event_group) { + struct perf_evsel *pos; + + if (!perf_evsel__is_group_leader(evsel)) + return 0; + + if (evsel->nr_members > 1) + printed += fprintf(fp, "%s{", evsel->group_name ?: ""); + + printed += fprintf(fp, "%s", perf_evsel__name(evsel)); + for_each_group_member(pos, evsel) + printed += fprintf(fp, ",%s", perf_evsel__name(pos)); + + if (evsel->nr_members > 1) + printed += fprintf(fp, "}"); + goto out; + } + + printed += fprintf(fp, "%s", perf_evsel__name(evsel)); + + if (details->verbose) { + printed += perf_event_attr__fprintf(fp, &evsel->attr, + __print_attr__fprintf, &first); + } else if (details->freq) { + const char *term = "sample_freq"; + + if (!evsel->attr.freq) + term = "sample_period"; + + printed += comma_fprintf(fp, &first, " %s=%" PRIu64, + term, (u64)evsel->attr.sample_freq); + } + + if (details->trace_fields) { + struct format_field *field; + + if (evsel->attr.type != PERF_TYPE_TRACEPOINT) { + printed += comma_fprintf(fp, &first, " (not a tracepoint)"); + goto out; + } + + field = evsel->tp_format->format.fields; + if (field == NULL) { + printed += comma_fprintf(fp, &first, " (no trace field)"); + goto out; + } + + printed += comma_fprintf(fp, &first, " trace_fields: %s", field->name); + + field = field->next; + while (field) { + printed += comma_fprintf(fp, &first, "%s", field->name); + field = field->next; + } + } +out: + fputc('\n', fp); + return ++printed; +} + +int sample__fprintf_callchain(struct perf_sample *sample, int left_alignment, + unsigned int print_opts, struct callchain_cursor *cursor, + FILE *fp) 
+{ + int printed = 0; + struct callchain_cursor_node *node; + int print_ip = print_opts & EVSEL__PRINT_IP; + int print_sym = print_opts & EVSEL__PRINT_SYM; + int print_dso = print_opts & EVSEL__PRINT_DSO; + int print_symoffset = print_opts & EVSEL__PRINT_SYMOFFSET; + int print_oneline = print_opts & EVSEL__PRINT_ONELINE; + int print_srcline = print_opts & EVSEL__PRINT_SRCLINE; + int print_unknown_as_addr = print_opts & EVSEL__PRINT_UNKNOWN_AS_ADDR; + char s = print_oneline ? ' ' : '\t'; + + if (sample->callchain) { + struct addr_location node_al; + + callchain_cursor_commit(cursor); + + while (1) { + u64 addr = 0; + + node = callchain_cursor_current(cursor); + if (!node) + break; + + if (node->sym && node->sym->ignore) + goto next; + + printed += fprintf(fp, "%-*.*s", left_alignment, left_alignment, " "); + + if (print_ip) + printed += fprintf(fp, "%c%16" PRIx64, s, node->ip); + + if (node->map) + addr = node->map->map_ip(node->map, node->ip); + + if (print_sym) { + printed += fprintf(fp, " "); + node_al.addr = addr; + node_al.map = node->map; + + if (print_symoffset) { + printed += __symbol__fprintf_symname_offs(node->sym, &node_al, + print_unknown_as_addr, fp); + } else { + printed += __symbol__fprintf_symname(node->sym, &node_al, + print_unknown_as_addr, fp); + } + } + + if (print_dso) { + printed += fprintf(fp, " ("); + printed += map__fprintf_dsoname(node->map, fp); + printed += fprintf(fp, ")"); + } + + if (print_srcline) + printed += map__fprintf_srcline(node->map, addr, "\n ", fp); + + if (!print_oneline) + printed += fprintf(fp, "\n"); +next: + callchain_cursor_advance(cursor); + } + } + + return printed; +} + +int sample__fprintf_sym(struct perf_sample *sample, struct addr_location *al, + int left_alignment, unsigned int print_opts, + struct callchain_cursor *cursor, FILE *fp) +{ + int printed = 0; + int print_ip = print_opts & EVSEL__PRINT_IP; + int print_sym = print_opts & EVSEL__PRINT_SYM; + int print_dso = print_opts & EVSEL__PRINT_DSO; + int print_symoffset = print_opts & EVSEL__PRINT_SYMOFFSET; + int print_srcline = print_opts & EVSEL__PRINT_SRCLINE; + int print_unknown_as_addr = print_opts & EVSEL__PRINT_UNKNOWN_AS_ADDR; + + if (cursor != NULL) { + printed += sample__fprintf_callchain(sample, left_alignment, + print_opts, cursor, fp); + } else if (!(al->sym && al->sym->ignore)) { + printed += fprintf(fp, "%-*.*s", left_alignment, left_alignment, " "); + + if (print_ip) + printed += fprintf(fp, "%16" PRIx64, sample->ip); + + if (print_sym) { + printed += fprintf(fp, " "); + if (print_symoffset) { + printed += __symbol__fprintf_symname_offs(al->sym, al, + print_unknown_as_addr, fp); + } else { + printed += __symbol__fprintf_symname(al->sym, al, + print_unknown_as_addr, fp); + } + } + + if (print_dso) { + printed += fprintf(fp, " ("); + printed += map__fprintf_dsoname(al->map, fp); + printed += fprintf(fp, ")"); + } + + if (print_srcline) + printed += map__fprintf_srcline(al->map, al->addr, "\n ", fp); + } + + return printed; +} diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 90680ec9f8b8..08852dde1378 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1474,7 +1474,7 @@ static int __event_process_build_id(struct build_id_event *bev, dso = machine__findnew_dso(machine, filename); if (dso != NULL) { - char sbuild_id[BUILD_ID_SIZE * 2 + 1]; + char sbuild_id[SBUILD_ID_SIZE]; dso__set_build_id(dso, &bev->build_id); @@ -1819,7 +1819,8 @@ static int process_cpu_topology(struct perf_file_section *section, ph->env.nr_sibling_cores = nr; 
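The two sample__fprintf_*() entry points moved into evsel_fprintf.c take a bitmask of the EVSEL__PRINT_* flags declared earlier. A hypothetical caller, assuming 'sample', 'al' and 'cursor' have already been filled in by the usual sample-resolution path:

    unsigned int print_opts = EVSEL__PRINT_IP | EVSEL__PRINT_SYM |
                              EVSEL__PRINT_DSO | EVSEL__PRINT_SRCLINE;

    /* Pass a callchain cursor to print the whole chain, or NULL to fall
     * back to the single resolved location in 'al'. */
    sample__fprintf_sym(sample, al, /* left_alignment */ 8, print_opts,
                        cursor, stdout);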
size += sizeof(u32); - strbuf_init(&sb, 128); + if (strbuf_init(&sb, 128) < 0) + goto free_cpu; for (i = 0; i < nr; i++) { str = do_read_string(fd, ph); @@ -1827,7 +1828,8 @@ static int process_cpu_topology(struct perf_file_section *section, goto error; /* include a NULL character at the end */ - strbuf_add(&sb, str, strlen(str) + 1); + if (strbuf_add(&sb, str, strlen(str) + 1) < 0) + goto error; size += string_size(str); free(str); } @@ -1849,7 +1851,8 @@ static int process_cpu_topology(struct perf_file_section *section, goto error; /* include a NULL character at the end */ - strbuf_add(&sb, str, strlen(str) + 1); + if (strbuf_add(&sb, str, strlen(str) + 1) < 0) + goto error; size += string_size(str); free(str); } @@ -1912,13 +1915,14 @@ static int process_numa_topology(struct perf_file_section *section __maybe_unuse /* nr nodes */ ret = readn(fd, &nr, sizeof(nr)); if (ret != sizeof(nr)) - goto error; + return -1; if (ph->needs_swap) nr = bswap_32(nr); ph->env.nr_numa_nodes = nr; - strbuf_init(&sb, 256); + if (strbuf_init(&sb, 256) < 0) + return -1; for (i = 0; i < nr; i++) { /* node number */ @@ -1940,15 +1944,17 @@ static int process_numa_topology(struct perf_file_section *section __maybe_unuse mem_free = bswap_64(mem_free); } - strbuf_addf(&sb, "%u:%"PRIu64":%"PRIu64":", - node, mem_total, mem_free); + if (strbuf_addf(&sb, "%u:%"PRIu64":%"PRIu64":", + node, mem_total, mem_free) < 0) + goto error; str = do_read_string(fd, ph); if (!str) goto error; /* include a NULL character at the end */ - strbuf_add(&sb, str, strlen(str) + 1); + if (strbuf_add(&sb, str, strlen(str) + 1) < 0) + goto error; free(str); } ph->env.numa_nodes = strbuf_detach(&sb, NULL); @@ -1982,7 +1988,8 @@ static int process_pmu_mappings(struct perf_file_section *section __maybe_unused } ph->env.nr_pmu_mappings = pmu_num; - strbuf_init(&sb, 128); + if (strbuf_init(&sb, 128) < 0) + return -1; while (pmu_num) { if (readn(fd, &type, sizeof(type)) != sizeof(type)) @@ -1994,9 +2001,11 @@ static int process_pmu_mappings(struct perf_file_section *section __maybe_unused if (!name) goto error; - strbuf_addf(&sb, "%u:%s", type, name); + if (strbuf_addf(&sb, "%u:%s", type, name) < 0) + goto error; /* include a NULL character at the end */ - strbuf_add(&sb, "", 1); + if (strbuf_add(&sb, "", 1) < 0) + goto error; if (!strcmp(name, "msr")) ph->env.msr_pmu_type = type; diff --git a/tools/perf/util/help-unknown-cmd.c b/tools/perf/util/help-unknown-cmd.c index 43a98a4dc1e1..d62ccaeeadd6 100644 --- a/tools/perf/util/help-unknown-cmd.c +++ b/tools/perf/util/help-unknown-cmd.c @@ -27,16 +27,27 @@ static int levenshtein_compare(const void *p1, const void *p2) return l1 != l2 ? 
l1 - l2 : strcmp(s1, s2); } -static void add_cmd_list(struct cmdnames *cmds, struct cmdnames *old) +static int add_cmd_list(struct cmdnames *cmds, struct cmdnames *old) { - unsigned int i; - - ALLOC_GROW(cmds->names, cmds->cnt + old->cnt, cmds->alloc); - + unsigned int i, nr = cmds->cnt + old->cnt; + void *tmp; + + if (nr > cmds->alloc) { + /* Choose bigger one to alloc */ + if (alloc_nr(cmds->alloc) < nr) + cmds->alloc = nr; + else + cmds->alloc = alloc_nr(cmds->alloc); + tmp = realloc(cmds->names, cmds->alloc * sizeof(*cmds->names)); + if (!tmp) + return -1; + cmds->names = tmp; + } for (i = 0; i < old->cnt; i++) cmds->names[cmds->cnt++] = old->names[i]; zfree(&old->names); old->cnt = 0; + return 0; } const char *help_unknown_cmd(const char *cmd) @@ -52,8 +63,11 @@ const char *help_unknown_cmd(const char *cmd) load_command_list("perf-", &main_cmds, &other_cmds); - add_cmd_list(&main_cmds, &aliases); - add_cmd_list(&main_cmds, &other_cmds); + if (add_cmd_list(&main_cmds, &aliases) < 0 || + add_cmd_list(&main_cmds, &other_cmds) < 0) { + fprintf(stderr, "ERROR: Failed to allocate command list for unknown command.\n"); + goto end; + } qsort(main_cmds.names, main_cmds.cnt, sizeof(main_cmds.names), cmdname_compare); uniq(&main_cmds); @@ -99,6 +113,6 @@ const char *help_unknown_cmd(const char *cmd) for (i = 0; i < n; i++) fprintf(stderr, "\t%s\n", main_cmds.names[i]->name); } - +end: exit(1); } diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 31c4641fe5ff..cfab531437c7 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -295,7 +295,7 @@ static void hists__delete_entry(struct hists *hists, struct hist_entry *he) root_in = &he->parent_he->hroot_in; root_out = &he->parent_he->hroot_out; } else { - if (sort__need_collapse) + if (hists__has(hists, need_collapse)) root_in = &hists->entries_collapsed; else root_in = hists->entries_in; @@ -953,7 +953,7 @@ int hist_entry_iter__add(struct hist_entry_iter *iter, struct addr_location *al, { int err, err2; - err = sample__resolve_callchain(iter->sample, &iter->parent, + err = sample__resolve_callchain(iter->sample, &callchain_cursor, &iter->parent, iter->evsel, al, max_stack_depth); if (err) return err; @@ -1295,8 +1295,9 @@ static int hists__hierarchy_insert_entry(struct hists *hists, return ret; } -int hists__collapse_insert_entry(struct hists *hists, struct rb_root *root, - struct hist_entry *he) +static int hists__collapse_insert_entry(struct hists *hists, + struct rb_root *root, + struct hist_entry *he) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; @@ -1372,7 +1373,7 @@ int hists__collapse_resort(struct hists *hists, struct ui_progress *prog) struct hist_entry *n; int ret; - if (!sort__need_collapse) + if (!hists__has(hists, need_collapse)) return 0; hists->nr_entries = 0; @@ -1631,7 +1632,7 @@ static void output_resort(struct hists *hists, struct ui_progress *prog, return; } - if (sort__need_collapse) + if (hists__has(hists, need_collapse)) root = &hists->entries_collapsed; else root = hists->entries_in; @@ -2035,7 +2036,7 @@ static struct hist_entry *hists__add_dummy_entry(struct hists *hists, struct hist_entry *he; int64_t cmp; - if (sort__need_collapse) + if (hists__has(hists, need_collapse)) root = &hists->entries_collapsed; else root = hists->entries_in; @@ -2061,6 +2062,8 @@ static struct hist_entry *hists__add_dummy_entry(struct hists *hists, if (he) { memset(&he->stat, 0, sizeof(he->stat)); he->hists = hists; + if (symbol_conf.cumulate_callchain) + memset(he->stat_acc, 0, 
sizeof(he->stat)); rb_link_node(&he->rb_node_in, parent, p); rb_insert_color(&he->rb_node_in, root); hists__inc_stats(hists, he); @@ -2075,7 +2078,7 @@ static struct hist_entry *hists__find_entry(struct hists *hists, { struct rb_node *n; - if (sort__need_collapse) + if (hists__has(hists, need_collapse)) n = hists->entries_collapsed.rb_node; else n = hists->entries_in->rb_node; @@ -2104,7 +2107,7 @@ void hists__match(struct hists *leader, struct hists *other) struct rb_node *nd; struct hist_entry *pos, *pair; - if (sort__need_collapse) + if (hists__has(leader, need_collapse)) root = &leader->entries_collapsed; else root = leader->entries_in; @@ -2129,7 +2132,7 @@ int hists__link(struct hists *leader, struct hists *other) struct rb_node *nd; struct hist_entry *pos, *pair; - if (sort__need_collapse) + if (hists__has(other, need_collapse)) root = &other->entries_collapsed; else root = other->entries_in; diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index bec0cd660fbd..0f84bfb42bb1 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -82,6 +82,8 @@ struct hists { int nr_hpp_node; }; +#define hists__has(__h, __f) (__h)->hpp_list->__f + struct hist_entry_iter; struct hist_iter_ops { @@ -199,8 +201,6 @@ int hists__init(void); int __hists__init(struct hists *hists, struct perf_hpp_list *hpp_list); struct rb_root *hists__get_rotate_entries_in(struct hists *hists); -int hists__collapse_insert_entry(struct hists *hists, - struct rb_root *root, struct hist_entry *he); struct perf_hpp { char *buf; @@ -240,6 +240,14 @@ struct perf_hpp_fmt { struct perf_hpp_list { struct list_head fields; struct list_head sorts; + + int need_collapse; + int parent; + int sym; + int dso; + int socket; + int thread; + int comm; }; extern struct perf_hpp_list perf_hpp_list; diff --git a/tools/perf/util/intel-bts.c b/tools/perf/util/intel-bts.c index abf1366e2a24..9df996085563 100644 --- a/tools/perf/util/intel-bts.c +++ b/tools/perf/util/intel-bts.c @@ -66,6 +66,7 @@ struct intel_bts { u64 branches_id; size_t branches_event_size; bool synth_needs_swap; + unsigned long num_events; }; struct intel_bts_queue { @@ -275,6 +276,10 @@ static int intel_bts_synth_branch_sample(struct intel_bts_queue *btsq, union perf_event event; struct perf_sample sample = { .ip = 0, }; + if (bts->synth_opts.initial_skip && + bts->num_events++ <= bts->synth_opts.initial_skip) + return 0; + event.sample.header.type = PERF_RECORD_SAMPLE; event.sample.header.misc = PERF_RECORD_MISC_USER; event.sample.header.size = sizeof(struct perf_event_header); diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c index 9409d014b46c..9c8f15da86ce 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c +++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c @@ -356,7 +356,7 @@ static const char *intel_pt_err_msgs[] = { int intel_pt__strerror(int code, char *buf, size_t buflen) { - if (code < 1 || code > INTEL_PT_ERR_MAX) + if (code < 1 || code >= INTEL_PT_ERR_MAX) code = INTEL_PT_ERR_UNK; strlcpy(buf, intel_pt_err_msgs[code], buflen); return 0; diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index 617578440989..137196990012 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -100,6 +100,8 @@ struct intel_pt { u64 cyc_bit; u64 noretcomp_bit; unsigned max_non_turbo_ratio; + + unsigned long num_events; }; enum switch_state { @@ -972,6 +974,10 @@ static int intel_pt_synth_branch_sample(struct intel_pt_queue *ptq) if 
(pt->branches_filter && !(pt->branches_filter & ptq->flags)) return 0; + if (pt->synth_opts.initial_skip && + pt->num_events++ < pt->synth_opts.initial_skip) + return 0; + event->sample.header.type = PERF_RECORD_SAMPLE; event->sample.header.misc = PERF_RECORD_MISC_USER; event->sample.header.size = sizeof(struct perf_event_header); @@ -1029,6 +1035,10 @@ static int intel_pt_synth_instruction_sample(struct intel_pt_queue *ptq) union perf_event *event = ptq->event_buf; struct perf_sample sample = { .ip = 0, }; + if (pt->synth_opts.initial_skip && + pt->num_events++ < pt->synth_opts.initial_skip) + return 0; + event->sample.header.type = PERF_RECORD_SAMPLE; event->sample.header.misc = PERF_RECORD_MISC_USER; event->sample.header.size = sizeof(struct perf_event_header); @@ -1087,6 +1097,10 @@ static int intel_pt_synth_transaction_sample(struct intel_pt_queue *ptq) union perf_event *event = ptq->event_buf; struct perf_sample sample = { .ip = 0, }; + if (pt->synth_opts.initial_skip && + pt->num_events++ < pt->synth_opts.initial_skip) + return 0; + event->sample.header.type = PERF_RECORD_SAMPLE; event->sample.header.misc = PERF_RECORD_MISC_USER; event->sample.header.size = sizeof(struct perf_event_header); @@ -1199,14 +1213,18 @@ static int intel_pt_sample(struct intel_pt_queue *ptq) ptq->have_sample = false; if (pt->sample_instructions && - (state->type & INTEL_PT_INSTRUCTION)) { + (state->type & INTEL_PT_INSTRUCTION) && + (!pt->synth_opts.initial_skip || + pt->num_events++ >= pt->synth_opts.initial_skip)) { err = intel_pt_synth_instruction_sample(ptq); if (err) return err; } if (pt->sample_transactions && - (state->type & INTEL_PT_TRANSACTION)) { + (state->type & INTEL_PT_TRANSACTION) && + (!pt->synth_opts.initial_skip || + pt->num_events++ >= pt->synth_opts.initial_skip)) { err = intel_pt_synth_transaction_sample(ptq); if (err) return err; diff --git a/tools/perf/util/jitdump.c b/tools/perf/util/jitdump.c index ad0c0bb1fbc7..86afe9618bb0 100644 --- a/tools/perf/util/jitdump.c +++ b/tools/perf/util/jitdump.c @@ -17,6 +17,7 @@ #include "strlist.h" #include <elf.h> +#include "tsc.h" #include "session.h" #include "jit.h" #include "jitdump.h" @@ -33,6 +34,7 @@ struct jit_buf_desc { size_t bufsize; FILE *in; bool needs_bswap; /* handles cross-endianess */ + bool use_arch_timestamp; void *debug_data; size_t nr_debug_entries; uint32_t code_load_count; @@ -158,13 +160,16 @@ jit_open(struct jit_buf_desc *jd, const char *name) header.flags = bswap_64(header.flags); } + jd->use_arch_timestamp = header.flags & JITDUMP_FLAGS_ARCH_TIMESTAMP; + if (verbose > 2) - pr_debug("version=%u\nhdr.size=%u\nts=0x%llx\npid=%d\nelf_mach=%d\n", + pr_debug("version=%u\nhdr.size=%u\nts=0x%llx\npid=%d\nelf_mach=%d\nuse_arch_timestamp=%d\n", header.version, header.total_size, (unsigned long long)header.timestamp, header.pid, - header.elf_mach); + header.elf_mach, + jd->use_arch_timestamp); if (header.flags & JITDUMP_FLAGS_RESERVED) { pr_err("jitdump file contains invalid or unsupported flags 0x%llx\n", @@ -172,10 +177,15 @@ jit_open(struct jit_buf_desc *jd, const char *name) goto error; } + if (jd->use_arch_timestamp && !jd->session->time_conv.time_mult) { + pr_err("jitdump file uses arch timestamps but there is no timestamp conversion\n"); + goto error; + } + /* * validate event is using the correct clockid */ - if (jit_validate_events(jd->session)) { + if (!jd->use_arch_timestamp && jit_validate_events(jd->session)) { pr_err("error, jitted code must be sampled with perf record -k 1\n"); goto error; } @@ -329,6 +339,23 @@ 
jit_inject_event(struct jit_buf_desc *jd, union perf_event *event) return 0; } +static uint64_t convert_timestamp(struct jit_buf_desc *jd, uint64_t timestamp) +{ + struct perf_tsc_conversion tc; + + if (!jd->use_arch_timestamp) + return timestamp; + + tc.time_shift = jd->session->time_conv.time_shift; + tc.time_mult = jd->session->time_conv.time_mult; + tc.time_zero = jd->session->time_conv.time_zero; + + if (!tc.time_mult) + return 0; + + return tsc_to_perf_time(timestamp, &tc); +} + static int jit_repipe_code_load(struct jit_buf_desc *jd, union jr_entry *jr) { struct perf_sample sample; @@ -385,7 +412,7 @@ static int jit_repipe_code_load(struct jit_buf_desc *jd, union jr_entry *jr) return -1; } if (stat(filename, &st)) - memset(&st, 0, sizeof(stat)); + memset(&st, 0, sizeof(st)); event->mmap2.header.type = PERF_RECORD_MMAP2; event->mmap2.header.misc = PERF_RECORD_MISC_USER; @@ -410,7 +437,7 @@ static int jit_repipe_code_load(struct jit_buf_desc *jd, union jr_entry *jr) id->tid = tid; } if (jd->sample_type & PERF_SAMPLE_TIME) - id->time = jr->load.p.timestamp; + id->time = convert_timestamp(jd, jr->load.p.timestamp); /* * create pseudo sample to induce dso hit increment @@ -473,7 +500,7 @@ static int jit_repipe_code_move(struct jit_buf_desc *jd, union jr_entry *jr) size++; /* for \0 */ if (stat(filename, &st)) - memset(&st, 0, sizeof(stat)); + memset(&st, 0, sizeof(st)); size = PERF_ALIGN(size, sizeof(u64)); @@ -499,7 +526,7 @@ static int jit_repipe_code_move(struct jit_buf_desc *jd, union jr_entry *jr) id->tid = tid; } if (jd->sample_type & PERF_SAMPLE_TIME) - id->time = jr->load.p.timestamp; + id->time = convert_timestamp(jd, jr->load.p.timestamp); /* * create pseudo sample to induce dso hit increment diff --git a/tools/perf/util/jitdump.h b/tools/perf/util/jitdump.h index b66c1f503d9e..bcacd20d0c1c 100644 --- a/tools/perf/util/jitdump.h +++ b/tools/perf/util/jitdump.h @@ -23,9 +23,12 @@ #define JITHEADER_VERSION 1 enum jitdump_flags_bits { + JITDUMP_FLAGS_ARCH_TIMESTAMP_BIT, JITDUMP_FLAGS_MAX_BIT, }; +#define JITDUMP_FLAGS_ARCH_TIMESTAMP (1ULL << JITDUMP_FLAGS_ARCH_TIMESTAMP_BIT) + #define JITDUMP_FLAGS_RESERVED (JITDUMP_FLAGS_MAX_BIT < 64 ? 
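convert_timestamp() above feeds the time_shift/time_mult/time_zero triple, now carried by the PERF_RECORD_TIME_CONV event, to tsc_to_perf_time(). For illustration, a sketch of the multiply-and-shift that conversion is expected to perform, following the formula documented for the perf mmap time-capability fields; treat this as an assumption about the helper, not its source:

    #include <stdint.h>

    static uint64_t tsc_to_perf_time_sketch(uint64_t tsc, uint16_t time_shift,
                                            uint32_t time_mult, uint64_t time_zero)
    {
            uint64_t quot = tsc >> time_shift;
            uint64_t rem  = tsc & (((uint64_t)1 << time_shift) - 1);

            /* Split the multiply so the intermediate product stays in 64 bits. */
            return time_zero + quot * time_mult + ((rem * time_mult) >> time_shift);
    }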
\ (~((1ULL << JITDUMP_FLAGS_MAX_BIT) - 1)) : 0) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 80b9b6a87990..639a2903065e 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -32,6 +32,7 @@ int machine__init(struct machine *machine, const char *root_dir, pid_t pid) machine->threads = RB_ROOT; pthread_rwlock_init(&machine->threads_lock, NULL); + machine->nr_threads = 0; INIT_LIST_HEAD(&machine->dead_threads); machine->last_match = NULL; @@ -430,6 +431,7 @@ static struct thread *____machine__findnew_thread(struct machine *machine, */ thread__get(th); machine->last_match = th; + ++machine->nr_threads; } return th; @@ -681,11 +683,13 @@ size_t machine__fprintf_vmlinux_path(struct machine *machine, FILE *fp) size_t machine__fprintf(struct machine *machine, FILE *fp) { - size_t ret = 0; + size_t ret; struct rb_node *nd; pthread_rwlock_rdlock(&machine->threads_lock); + ret = fprintf(fp, "Threads: %u\n", machine->nr_threads); + for (nd = rb_first(&machine->threads); nd; nd = rb_next(nd)) { struct thread *pos = rb_entry(nd, struct thread, rb_node); @@ -908,11 +912,11 @@ int machines__create_kernel_maps(struct machines *machines, pid_t pid) return machine__create_kernel_maps(machine); } -int machine__load_kallsyms(struct machine *machine, const char *filename, - enum map_type type, symbol_filter_t filter) +int __machine__load_kallsyms(struct machine *machine, const char *filename, + enum map_type type, bool no_kcore, symbol_filter_t filter) { struct map *map = machine__kernel_map(machine); - int ret = dso__load_kallsyms(map->dso, filename, map, filter); + int ret = __dso__load_kallsyms(map->dso, filename, map, no_kcore, filter); if (ret > 0) { dso__set_loaded(map->dso, type); @@ -927,6 +931,12 @@ int machine__load_kallsyms(struct machine *machine, const char *filename, return ret; } +int machine__load_kallsyms(struct machine *machine, const char *filename, + enum map_type type, symbol_filter_t filter) +{ + return __machine__load_kallsyms(machine, filename, type, false, filter); +} + int machine__load_vmlinux_path(struct machine *machine, enum map_type type, symbol_filter_t filter) { @@ -1413,6 +1423,7 @@ static void __machine__remove_thread(struct machine *machine, struct thread *th, pthread_rwlock_wrlock(&machine->threads_lock); rb_erase_init(&th->rb_node, &machine->threads); RB_CLEAR_NODE(&th->rb_node); + --machine->nr_threads; /* * Move it first to the dead_threads list, then drop the reference, * if this is the last reference, then the thread__delete destructor @@ -1599,6 +1610,7 @@ struct mem_info *sample__resolve_mem(struct perf_sample *sample, } static int add_callchain_ip(struct thread *thread, + struct callchain_cursor *cursor, struct symbol **parent, struct addr_location *root_al, u8 *cpumode, @@ -1630,7 +1642,7 @@ static int add_callchain_ip(struct thread *thread, * It seems the callchain is corrupted. * Discard all. */ - callchain_cursor_reset(&callchain_cursor); + callchain_cursor_reset(cursor); return 1; } return 0; @@ -1640,7 +1652,7 @@ static int add_callchain_ip(struct thread *thread, } if (al.sym != NULL) { - if (sort__has_parent && !*parent && + if (perf_hpp_list.parent && !*parent && symbol__match_regex(al.sym, &parent_regex)) *parent = al.sym; else if (have_ignore_callees && root_al && @@ -1648,13 +1660,13 @@ static int add_callchain_ip(struct thread *thread, /* Treat this symbol as the root, forgetting its callees. 
*/ *root_al = al; - callchain_cursor_reset(&callchain_cursor); + callchain_cursor_reset(cursor); } } if (symbol_conf.hide_unresolved && al.sym == NULL) return 0; - return callchain_cursor_append(&callchain_cursor, al.addr, al.map, al.sym); + return callchain_cursor_append(cursor, al.addr, al.map, al.sym); } struct branch_info *sample__resolve_bstack(struct perf_sample *sample, @@ -1724,6 +1736,7 @@ static int remove_loops(struct branch_entry *l, int nr) * negative error code on other errors. */ static int resolve_lbr_callchain_sample(struct thread *thread, + struct callchain_cursor *cursor, struct perf_sample *sample, struct symbol **parent, struct addr_location *root_al, @@ -1756,7 +1769,7 @@ static int resolve_lbr_callchain_sample(struct thread *thread, */ int mix_chain_nr = i + 1 + lbr_nr + 1; - if (mix_chain_nr > PERF_MAX_STACK_DEPTH + PERF_MAX_BRANCH_DEPTH) { + if (mix_chain_nr > (int)sysctl_perf_event_max_stack + PERF_MAX_BRANCH_DEPTH) { pr_warning("corrupted callchain. skipping...\n"); return 0; } @@ -1778,7 +1791,7 @@ static int resolve_lbr_callchain_sample(struct thread *thread, ip = lbr_stack->entries[0].to; } - err = add_callchain_ip(thread, parent, root_al, &cpumode, ip); + err = add_callchain_ip(thread, cursor, parent, root_al, &cpumode, ip); if (err) return (err < 0) ? err : 0; } @@ -1789,6 +1802,7 @@ static int resolve_lbr_callchain_sample(struct thread *thread, } static int thread__resolve_callchain_sample(struct thread *thread, + struct callchain_cursor *cursor, struct perf_evsel *evsel, struct perf_sample *sample, struct symbol **parent, @@ -1803,10 +1817,8 @@ static int thread__resolve_callchain_sample(struct thread *thread, int skip_idx = -1; int first_call = 0; - callchain_cursor_reset(&callchain_cursor); - - if (has_branch_callstack(evsel)) { - err = resolve_lbr_callchain_sample(thread, sample, parent, + if (perf_evsel__has_branch_callstack(evsel)) { + err = resolve_lbr_callchain_sample(thread, cursor, sample, parent, root_al, max_stack); if (err) return (err < 0) ? err : 0; @@ -1816,7 +1828,7 @@ static int thread__resolve_callchain_sample(struct thread *thread, * Based on DWARF debug information, some architectures skip * a callchain entry saved by the kernel. */ - if (chain->nr < PERF_MAX_STACK_DEPTH) + if (chain->nr < sysctl_perf_event_max_stack) skip_idx = arch_skip_callchain_idx(thread, chain); /* @@ -1863,10 +1875,10 @@ static int thread__resolve_callchain_sample(struct thread *thread, nr = remove_loops(be, nr); for (i = 0; i < nr; i++) { - err = add_callchain_ip(thread, parent, root_al, + err = add_callchain_ip(thread, cursor, parent, root_al, NULL, be[i].to); if (!err) - err = add_callchain_ip(thread, parent, root_al, + err = add_callchain_ip(thread, cursor, parent, root_al, NULL, be[i].from); if (err == -EINVAL) break; @@ -1877,7 +1889,7 @@ static int thread__resolve_callchain_sample(struct thread *thread, } check_calls: - if (chain->nr > PERF_MAX_STACK_DEPTH && (int)chain->nr > max_stack) { + if (chain->nr > sysctl_perf_event_max_stack && (int)chain->nr > max_stack) { pr_warning("corrupted callchain. skipping...\n"); return 0; } @@ -1896,7 +1908,7 @@ check_calls: #endif ip = chain->ips[j]; - err = add_callchain_ip(thread, parent, root_al, &cpumode, ip); + err = add_callchain_ip(thread, cursor, parent, root_al, &cpumode, ip); if (err) return (err < 0) ? 
err : 0; @@ -1915,19 +1927,12 @@ static int unwind_entry(struct unwind_entry *entry, void *arg) entry->map, entry->sym); } -int thread__resolve_callchain(struct thread *thread, - struct perf_evsel *evsel, - struct perf_sample *sample, - struct symbol **parent, - struct addr_location *root_al, - int max_stack) +static int thread__resolve_callchain_unwind(struct thread *thread, + struct callchain_cursor *cursor, + struct perf_evsel *evsel, + struct perf_sample *sample, + int max_stack) { - int ret = thread__resolve_callchain_sample(thread, evsel, - sample, parent, - root_al, max_stack); - if (ret) - return ret; - /* Can we do dwarf post unwind? */ if (!((evsel->attr.sample_type & PERF_SAMPLE_REGS_USER) && (evsel->attr.sample_type & PERF_SAMPLE_STACK_USER))) @@ -1938,9 +1943,45 @@ int thread__resolve_callchain(struct thread *thread, (!sample->user_stack.size)) return 0; - return unwind__get_entries(unwind_entry, &callchain_cursor, + return unwind__get_entries(unwind_entry, cursor, thread, sample, max_stack); +} + +int thread__resolve_callchain(struct thread *thread, + struct callchain_cursor *cursor, + struct perf_evsel *evsel, + struct perf_sample *sample, + struct symbol **parent, + struct addr_location *root_al, + int max_stack) +{ + int ret = 0; + + callchain_cursor_reset(&callchain_cursor); + if (callchain_param.order == ORDER_CALLEE) { + ret = thread__resolve_callchain_sample(thread, cursor, + evsel, sample, + parent, root_al, + max_stack); + if (ret) + return ret; + ret = thread__resolve_callchain_unwind(thread, cursor, + evsel, sample, + max_stack); + } else { + ret = thread__resolve_callchain_unwind(thread, cursor, + evsel, sample, + max_stack); + if (ret) + return ret; + ret = thread__resolve_callchain_sample(thread, cursor, + evsel, sample, + parent, root_al, + max_stack); + } + + return ret; } int machine__for_each_thread(struct machine *machine, diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h index 8499db281158..83f46790c52f 100644 --- a/tools/perf/util/machine.h +++ b/tools/perf/util/machine.h @@ -31,6 +31,7 @@ struct machine { char *root_dir; struct rb_root threads; pthread_rwlock_t threads_lock; + unsigned int nr_threads; struct list_head dead_threads; struct thread *last_match; struct vdso_info *vdso_info; @@ -141,7 +142,11 @@ struct branch_info *sample__resolve_bstack(struct perf_sample *sample, struct addr_location *al); struct mem_info *sample__resolve_mem(struct perf_sample *sample, struct addr_location *al); + +struct callchain_cursor; + int thread__resolve_callchain(struct thread *thread, + struct callchain_cursor *cursor, struct perf_evsel *evsel, struct perf_sample *sample, struct symbol **parent, @@ -211,6 +216,8 @@ struct symbol *machine__find_kernel_function_by_name(struct machine *machine, struct map *machine__findnew_module_map(struct machine *machine, u64 start, const char *filename); +int __machine__load_kallsyms(struct machine *machine, const char *filename, + enum map_type type, bool no_kcore, symbol_filter_t filter); int machine__load_kallsyms(struct machine *machine, const char *filename, enum map_type type, symbol_filter_t filter); int machine__load_vmlinux_path(struct machine *machine, enum map_type type, diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index 171b6d10a04b..b19bcd3b7128 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -289,7 +289,7 @@ int map__load(struct map *map, symbol_filter_t filter) nr = dso__load(map->dso, map, filter); if (nr < 0) { if (map->dso->has_build_id) { - char 
sbuild_id[BUILD_ID_SIZE * 2 + 1]; + char sbuild_id[SBUILD_ID_SIZE]; build_id__sprintf(map->dso->build_id, sizeof(map->dso->build_id), @@ -431,6 +431,13 @@ u64 map__rip_2objdump(struct map *map, u64 rip) if (map->dso->rel) return rip - map->pgoff; + /* + * kernel modules also have DSO_TYPE_USER in dso->kernel, + * but all kernel modules are ET_REL, so won't get here. + */ + if (map->dso->kernel == DSO_TYPE_USER) + return rip + map->dso->text_offset; + return map->unmap_ip(map, rip) - map->reloc; } @@ -454,6 +461,13 @@ u64 map__objdump_2mem(struct map *map, u64 ip) if (map->dso->rel) return map->unmap_ip(map, ip + map->pgoff); + /* + * kernel modules also have DSO_TYPE_USER in dso->kernel, + * but all kernel modules are ET_REL, so won't get here. + */ + if (map->dso->kernel == DSO_TYPE_USER) + return map->unmap_ip(map, ip - map->dso->text_offset); + return ip + map->reloc; } diff --git a/tools/perf/util/ordered-events.c b/tools/perf/util/ordered-events.c index b1b9e2385f4b..fe84df1875aa 100644 --- a/tools/perf/util/ordered-events.c +++ b/tools/perf/util/ordered-events.c @@ -308,3 +308,12 @@ void ordered_events__free(struct ordered_events *oe) free(event); } } + +void ordered_events__reinit(struct ordered_events *oe) +{ + ordered_events__deliver_t old_deliver = oe->deliver; + + ordered_events__free(oe); + memset(oe, '\0', sizeof(*oe)); + ordered_events__init(oe, old_deliver); +} diff --git a/tools/perf/util/ordered-events.h b/tools/perf/util/ordered-events.h index f403991e3bfd..e11468a9a6e4 100644 --- a/tools/perf/util/ordered-events.h +++ b/tools/perf/util/ordered-events.h @@ -49,6 +49,7 @@ void ordered_events__delete(struct ordered_events *oe, struct ordered_event *eve int ordered_events__flush(struct ordered_events *oe, enum oe_flush how); void ordered_events__init(struct ordered_events *oe, ordered_events__deliver_t deliver); void ordered_events__free(struct ordered_events *oe); +void ordered_events__reinit(struct ordered_events *oe); static inline void ordered_events__set_alloc_size(struct ordered_events *oe, u64 size) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 4c19d5e79d8c..bcbc983d4b12 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -138,11 +138,11 @@ struct event_symbol event_symbols_sw[PERF_COUNT_SW_MAX] = { #define PERF_EVENT_TYPE(config) __PERF_EVENT_FIELD(config, TYPE) #define PERF_EVENT_ID(config) __PERF_EVENT_FIELD(config, EVENT) -#define for_each_subsystem(sys_dir, sys_dirent, sys_next) \ - while (!readdir_r(sys_dir, &sys_dirent, &sys_next) && sys_next) \ - if (sys_dirent.d_type == DT_DIR && \ - (strcmp(sys_dirent.d_name, ".")) && \ - (strcmp(sys_dirent.d_name, ".."))) +#define for_each_subsystem(sys_dir, sys_dirent) \ + while ((sys_dirent = readdir(sys_dir)) != NULL) \ + if (sys_dirent->d_type == DT_DIR && \ + (strcmp(sys_dirent->d_name, ".")) && \ + (strcmp(sys_dirent->d_name, ".."))) static int tp_event_has_id(struct dirent *sys_dir, struct dirent *evt_dir) { @@ -159,12 +159,12 @@ static int tp_event_has_id(struct dirent *sys_dir, struct dirent *evt_dir) return 0; } -#define for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next) \ - while (!readdir_r(evt_dir, &evt_dirent, &evt_next) && evt_next) \ - if (evt_dirent.d_type == DT_DIR && \ - (strcmp(evt_dirent.d_name, ".")) && \ - (strcmp(evt_dirent.d_name, "..")) && \ - (!tp_event_has_id(&sys_dirent, &evt_dirent))) +#define for_each_event(sys_dirent, evt_dir, evt_dirent) \ + while ((evt_dirent = readdir(evt_dir)) != NULL) \ + if (evt_dirent->d_type 
== DT_DIR && \ + (strcmp(evt_dirent->d_name, ".")) && \ + (strcmp(evt_dirent->d_name, "..")) && \ + (!tp_event_has_id(sys_dirent, evt_dirent))) #define MAX_EVENT_LENGTH 512 @@ -173,7 +173,7 @@ struct tracepoint_path *tracepoint_id_to_path(u64 config) { struct tracepoint_path *path = NULL; DIR *sys_dir, *evt_dir; - struct dirent *sys_next, *evt_next, sys_dirent, evt_dirent; + struct dirent *sys_dirent, *evt_dirent; char id_buf[24]; int fd; u64 id; @@ -184,18 +184,18 @@ struct tracepoint_path *tracepoint_id_to_path(u64 config) if (!sys_dir) return NULL; - for_each_subsystem(sys_dir, sys_dirent, sys_next) { + for_each_subsystem(sys_dir, sys_dirent) { snprintf(dir_path, MAXPATHLEN, "%s/%s", tracing_events_path, - sys_dirent.d_name); + sys_dirent->d_name); evt_dir = opendir(dir_path); if (!evt_dir) continue; - for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next) { + for_each_event(sys_dirent, evt_dir, evt_dirent) { snprintf(evt_path, MAXPATHLEN, "%s/%s/id", dir_path, - evt_dirent.d_name); + evt_dirent->d_name); fd = open(evt_path, O_RDONLY); if (fd < 0) continue; @@ -220,9 +220,9 @@ struct tracepoint_path *tracepoint_id_to_path(u64 config) free(path); return NULL; } - strncpy(path->system, sys_dirent.d_name, + strncpy(path->system, sys_dirent->d_name, MAX_EVENT_LENGTH); - strncpy(path->name, evt_dirent.d_name, + strncpy(path->name, evt_dirent->d_name, MAX_EVENT_LENGTH); return path; } @@ -1812,7 +1812,7 @@ void print_tracepoint_events(const char *subsys_glob, const char *event_glob, bool name_only) { DIR *sys_dir, *evt_dir; - struct dirent *sys_next, *evt_next, sys_dirent, evt_dirent; + struct dirent *sys_dirent, *evt_dirent; char evt_path[MAXPATHLEN]; char dir_path[MAXPATHLEN]; char **evt_list = NULL; @@ -1830,20 +1830,20 @@ restart: goto out_close_sys_dir; } - for_each_subsystem(sys_dir, sys_dirent, sys_next) { + for_each_subsystem(sys_dir, sys_dirent) { if (subsys_glob != NULL && - !strglobmatch(sys_dirent.d_name, subsys_glob)) + !strglobmatch(sys_dirent->d_name, subsys_glob)) continue; snprintf(dir_path, MAXPATHLEN, "%s/%s", tracing_events_path, - sys_dirent.d_name); + sys_dirent->d_name); evt_dir = opendir(dir_path); if (!evt_dir) continue; - for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next) { + for_each_event(sys_dirent, evt_dir, evt_dirent) { if (event_glob != NULL && - !strglobmatch(evt_dirent.d_name, event_glob)) + !strglobmatch(evt_dirent->d_name, event_glob)) continue; if (!evt_num_known) { @@ -1852,7 +1852,7 @@ restart: } snprintf(evt_path, MAXPATHLEN, "%s:%s", - sys_dirent.d_name, evt_dirent.d_name); + sys_dirent->d_name, evt_dirent->d_name); evt_list[evt_i] = strdup(evt_path); if (evt_list[evt_i] == NULL) @@ -1905,7 +1905,7 @@ out_close_sys_dir: int is_valid_tracepoint(const char *event_string) { DIR *sys_dir, *evt_dir; - struct dirent *sys_next, *evt_next, sys_dirent, evt_dirent; + struct dirent *sys_dirent, *evt_dirent; char evt_path[MAXPATHLEN]; char dir_path[MAXPATHLEN]; @@ -1913,17 +1913,17 @@ int is_valid_tracepoint(const char *event_string) if (!sys_dir) return 0; - for_each_subsystem(sys_dir, sys_dirent, sys_next) { + for_each_subsystem(sys_dir, sys_dirent) { snprintf(dir_path, MAXPATHLEN, "%s/%s", tracing_events_path, - sys_dirent.d_name); + sys_dirent->d_name); evt_dir = opendir(dir_path); if (!evt_dir) continue; - for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next) { + for_each_event(sys_dirent, evt_dir, evt_dirent) { snprintf(evt_path, MAXPATHLEN, "%s:%s", - sys_dirent.d_name, evt_dirent.d_name); + sys_dirent->d_name, evt_dirent->d_name); if 
(!strcmp(evt_path, event_string)) { closedir(evt_dir); closedir(sys_dir); diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index adef23b1352e..ddb0261b2577 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -602,14 +602,13 @@ static void pmu_format_value(unsigned long *format, __u64 value, __u64 *v, static __u64 pmu_format_max_value(const unsigned long *format) { - int w; + __u64 w = 0; + int fbit; - w = bitmap_weight(format, PERF_PMU_FORMAT_BITS); - if (!w) - return 0; - if (w < 64) - return (1ULL << w) - 1; - return -1; + for_each_set_bit(fbit, format, PERF_PMU_FORMAT_BITS) + w |= (1ULL << fbit); + + return w; } /* @@ -644,20 +643,20 @@ static int pmu_resolve_param_term(struct parse_events_term *term, static char *pmu_formats_string(struct list_head *formats) { struct perf_pmu_format *format; - char *str; - struct strbuf buf; + char *str = NULL; + struct strbuf buf = STRBUF_INIT; unsigned i = 0; if (!formats) return NULL; - strbuf_init(&buf, 0); /* sysfs exported terms */ list_for_each_entry(format, formats, list) - strbuf_addf(&buf, i++ ? ",%s" : "%s", - format->name); + if (strbuf_addf(&buf, i++ ? ",%s" : "%s", format->name) < 0) + goto error; str = strbuf_detach(&buf, NULL); +error: strbuf_release(&buf); return str; diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 8319fbb08636..74401a20106d 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -265,6 +265,65 @@ static bool kprobe_warn_out_range(const char *symbol, unsigned long address) return true; } +/* + * NOTE: + * '.gnu.linkonce.this_module' section of kernel module elf directly + * maps to 'struct module' from linux/module.h. This section contains + * actual module name which will be used by kernel after loading it. + * But, we cannot use 'struct module' here since linux/module.h is not + * exposed to user-space. Offset of 'name' has remained same from long + * time, so hardcoding it here. + */ +#ifdef __LP64__ +#define MOD_NAME_OFFSET 24 +#else +#define MOD_NAME_OFFSET 12 +#endif + +/* + * @module can be module name of module file path. In case of path, + * inspect elf and find out what is actual module name. + * Caller has to free mod_name after using it. 
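One note on the pmu.c change above: pmu_format_max_value() now ORs the individual format bits instead of deriving the maximum from bitmap_weight(), so a non-contiguous sysfs format field yields the right ceiling. A worked example with a hypothetical format covering bits 0-1 and 4:

    /*
     * format bits {0, 1, 4}:
     *   old code: bitmap_weight() == 3 -> (1ULL << 3) - 1 == 0x07  (too small)
     *   new code: 1<<0 | 1<<1 | 1<<4   -> 0x13                     (correct)
     */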
+ */ +static char *find_module_name(const char *module) +{ + int fd; + Elf *elf; + GElf_Ehdr ehdr; + GElf_Shdr shdr; + Elf_Data *data; + Elf_Scn *sec; + char *mod_name = NULL; + + fd = open(module, O_RDONLY); + if (fd < 0) + return NULL; + + elf = elf_begin(fd, PERF_ELF_C_READ_MMAP, NULL); + if (elf == NULL) + goto elf_err; + + if (gelf_getehdr(elf, &ehdr) == NULL) + goto ret_err; + + sec = elf_section_by_name(elf, &ehdr, &shdr, + ".gnu.linkonce.this_module", NULL); + if (!sec) + goto ret_err; + + data = elf_getdata(sec, NULL); + if (!data || !data->d_buf) + goto ret_err; + + mod_name = strdup((char *)data->d_buf + MOD_NAME_OFFSET); + +ret_err: + elf_end(elf); +elf_err: + close(fd); + return mod_name; +} + #ifdef HAVE_DWARF_SUPPORT static int kernel_get_module_dso(const char *module, struct dso **pdso) @@ -486,8 +545,10 @@ static int get_text_start_address(const char *exec, unsigned long *address) return -errno; elf = elf_begin(fd, PERF_ELF_C_READ_MMAP, NULL); - if (elf == NULL) - return -EINVAL; + if (elf == NULL) { + ret = -EINVAL; + goto out_close; + } if (gelf_getehdr(elf, &ehdr) == NULL) goto out; @@ -499,6 +560,9 @@ static int get_text_start_address(const char *exec, unsigned long *address) ret = 0; out: elf_end(elf); +out_close: + close(fd); + return ret; } @@ -583,32 +647,23 @@ static int add_module_to_probe_trace_events(struct probe_trace_event *tevs, int ntevs, const char *module) { int i, ret = 0; - char *tmp; + char *mod_name = NULL; if (!module) return 0; - tmp = strrchr(module, '/'); - if (tmp) { - /* This is a module path -- get the module name */ - module = strdup(tmp + 1); - if (!module) - return -ENOMEM; - tmp = strchr(module, '.'); - if (tmp) - *tmp = '\0'; - tmp = (char *)module; /* For free() */ - } + mod_name = find_module_name(module); for (i = 0; i < ntevs; i++) { - tevs[i].point.module = strdup(module); + tevs[i].point.module = + strdup(mod_name ? mod_name : module); if (!tevs[i].point.module) { ret = -ENOMEM; break; } } - free(tmp); + free(mod_name); return ret; } @@ -1618,69 +1673,65 @@ out: } /* Compose only probe arg */ -int synthesize_perf_probe_arg(struct perf_probe_arg *pa, char *buf, size_t len) +char *synthesize_perf_probe_arg(struct perf_probe_arg *pa) { struct perf_probe_arg_field *field = pa->field; - int ret; - char *tmp = buf; + struct strbuf buf; + char *ret = NULL; + int err; + + if (strbuf_init(&buf, 64) < 0) + return NULL; if (pa->name && pa->var) - ret = e_snprintf(tmp, len, "%s=%s", pa->name, pa->var); + err = strbuf_addf(&buf, "%s=%s", pa->name, pa->var); else - ret = e_snprintf(tmp, len, "%s", pa->name ? pa->name : pa->var); - if (ret <= 0) - goto error; - tmp += ret; - len -= ret; + err = strbuf_addstr(&buf, pa->name ?: pa->var); + if (err) + goto out; while (field) { if (field->name[0] == '[') - ret = e_snprintf(tmp, len, "%s", field->name); + err = strbuf_addstr(&buf, field->name); else - ret = e_snprintf(tmp, len, "%s%s", - field->ref ? "->" : ".", field->name); - if (ret <= 0) - goto error; - tmp += ret; - len -= ret; + err = strbuf_addf(&buf, "%s%s", field->ref ? 
"->" : ".", + field->name); field = field->next; + if (err) + goto out; } - if (pa->type) { - ret = e_snprintf(tmp, len, ":%s", pa->type); - if (ret <= 0) - goto error; - tmp += ret; - len -= ret; - } + if (pa->type) + if (strbuf_addf(&buf, ":%s", pa->type) < 0) + goto out; - return tmp - buf; -error: - pr_debug("Failed to synthesize perf probe argument: %d\n", ret); + ret = strbuf_detach(&buf, NULL); +out: + strbuf_release(&buf); return ret; } /* Compose only probe point (not argument) */ static char *synthesize_perf_probe_point(struct perf_probe_point *pp) { - char *buf, *tmp; - char offs[32] = "", line[32] = "", file[32] = ""; - int ret, len; + struct strbuf buf; + char *tmp, *ret = NULL; + int len, err = 0; - buf = zalloc(MAX_CMDLEN); - if (buf == NULL) { - ret = -ENOMEM; - goto error; - } - if (pp->offset) { - ret = e_snprintf(offs, 32, "+%lu", pp->offset); - if (ret <= 0) - goto error; - } - if (pp->line) { - ret = e_snprintf(line, 32, ":%d", pp->line); - if (ret <= 0) - goto error; + if (strbuf_init(&buf, 64) < 0) + return NULL; + + if (pp->function) { + if (strbuf_addstr(&buf, pp->function) < 0) + goto out; + if (pp->offset) + err = strbuf_addf(&buf, "+%lu", pp->offset); + else if (pp->line) + err = strbuf_addf(&buf, ":%d", pp->line); + else if (pp->retprobe) + err = strbuf_addstr(&buf, "%return"); + if (err) + goto out; } if (pp->file) { tmp = pp->file; @@ -1689,25 +1740,15 @@ static char *synthesize_perf_probe_point(struct perf_probe_point *pp) tmp = strchr(pp->file + len - 30, '/'); tmp = tmp ? tmp + 1 : pp->file + len - 30; } - ret = e_snprintf(file, 32, "@%s", tmp); - if (ret <= 0) - goto error; + err = strbuf_addf(&buf, "@%s", tmp); + if (!err && !pp->function && pp->line) + err = strbuf_addf(&buf, ":%d", pp->line); } - - if (pp->function) - ret = e_snprintf(buf, MAX_CMDLEN, "%s%s%s%s%s", pp->function, - offs, pp->retprobe ? "%return" : "", line, - file); - else - ret = e_snprintf(buf, MAX_CMDLEN, "%s%s", file, line); - if (ret <= 0) - goto error; - - return buf; -error: - pr_debug("Failed to synthesize perf probe point: %d\n", ret); - free(buf); - return NULL; + if (!err) + ret = strbuf_detach(&buf, NULL); +out: + strbuf_release(&buf); + return ret; } #if 0 @@ -1736,45 +1777,32 @@ char *synthesize_perf_probe_command(struct perf_probe_event *pev) #endif static int __synthesize_probe_trace_arg_ref(struct probe_trace_arg_ref *ref, - char **buf, size_t *buflen, - int depth) + struct strbuf *buf, int depth) { - int ret; + int err; if (ref->next) { depth = __synthesize_probe_trace_arg_ref(ref->next, buf, - buflen, depth + 1); + depth + 1); if (depth < 0) - goto out; - } - - ret = e_snprintf(*buf, *buflen, "%+ld(", ref->offset); - if (ret < 0) - depth = ret; - else { - *buf += ret; - *buflen -= ret; + return depth; } -out: - return depth; - + err = strbuf_addf(buf, "%+ld(", ref->offset); + return (err < 0) ? 
err : depth; } static int synthesize_probe_trace_arg(struct probe_trace_arg *arg, - char *buf, size_t buflen) + struct strbuf *buf) { struct probe_trace_arg_ref *ref = arg->ref; - int ret, depth = 0; - char *tmp = buf; + int depth = 0, err; /* Argument name or separator */ if (arg->name) - ret = e_snprintf(buf, buflen, " %s=", arg->name); + err = strbuf_addf(buf, " %s=", arg->name); else - ret = e_snprintf(buf, buflen, " "); - if (ret < 0) - return ret; - buf += ret; - buflen -= ret; + err = strbuf_addch(buf, ' '); + if (err) + return err; /* Special case: @XXX */ if (arg->value[0] == '@' && arg->ref) @@ -1782,59 +1810,44 @@ static int synthesize_probe_trace_arg(struct probe_trace_arg *arg, /* Dereferencing arguments */ if (ref) { - depth = __synthesize_probe_trace_arg_ref(ref, &buf, - &buflen, 1); + depth = __synthesize_probe_trace_arg_ref(ref, buf, 1); if (depth < 0) return depth; } /* Print argument value */ if (arg->value[0] == '@' && arg->ref) - ret = e_snprintf(buf, buflen, "%s%+ld", arg->value, - arg->ref->offset); + err = strbuf_addf(buf, "%s%+ld", arg->value, arg->ref->offset); else - ret = e_snprintf(buf, buflen, "%s", arg->value); - if (ret < 0) - return ret; - buf += ret; - buflen -= ret; + err = strbuf_addstr(buf, arg->value); /* Closing */ - while (depth--) { - ret = e_snprintf(buf, buflen, ")"); - if (ret < 0) - return ret; - buf += ret; - buflen -= ret; - } + while (!err && depth--) + err = strbuf_addch(buf, ')'); + /* Print argument type */ - if (arg->type) { - ret = e_snprintf(buf, buflen, ":%s", arg->type); - if (ret <= 0) - return ret; - buf += ret; - } + if (!err && arg->type) + err = strbuf_addf(buf, ":%s", arg->type); - return buf - tmp; + return err; } char *synthesize_probe_trace_command(struct probe_trace_event *tev) { struct probe_trace_point *tp = &tev->point; - char *buf; - int i, len, ret; + struct strbuf buf; + char *ret = NULL; + int i, err; - buf = zalloc(MAX_CMDLEN); - if (buf == NULL) + /* Uprobes must have tp->module */ + if (tev->uprobes && !tp->module) return NULL; - len = e_snprintf(buf, MAX_CMDLEN, "%c:%s/%s ", tp->retprobe ? 'r' : 'p', - tev->group, tev->event); - if (len <= 0) - goto error; + if (strbuf_init(&buf, 32) < 0) + return NULL; - /* Uprobes must have tp->module */ - if (tev->uprobes && !tp->module) + if (strbuf_addf(&buf, "%c:%s/%s ", tp->retprobe ? 'r' : 'p', + tev->group, tev->event) < 0) goto error; /* * If tp->address == 0, then this point must be a @@ -1849,34 +1862,25 @@ char *synthesize_probe_trace_command(struct probe_trace_event *tev) /* Use the tp->address for uprobes */ if (tev->uprobes) - ret = e_snprintf(buf + len, MAX_CMDLEN - len, "%s:0x%lx", - tp->module, tp->address); + err = strbuf_addf(&buf, "%s:0x%lx", tp->module, tp->address); else if (!strncmp(tp->symbol, "0x", 2)) /* Absolute address. See try_to_find_absolute_address() */ - ret = e_snprintf(buf + len, MAX_CMDLEN - len, "%s%s0x%lx", - tp->module ?: "", tp->module ? ":" : "", - tp->address); + err = strbuf_addf(&buf, "%s%s0x%lx", tp->module ?: "", + tp->module ? ":" : "", tp->address); else - ret = e_snprintf(buf + len, MAX_CMDLEN - len, "%s%s%s+%lu", - tp->module ?: "", tp->module ? ":" : "", - tp->symbol, tp->offset); - - if (ret <= 0) + err = strbuf_addf(&buf, "%s%s%s+%lu", tp->module ?: "", + tp->module ? 
":" : "", tp->symbol, tp->offset); + if (err) goto error; - len += ret; - for (i = 0; i < tev->nargs; i++) { - ret = synthesize_probe_trace_arg(&tev->args[i], buf + len, - MAX_CMDLEN - len); - if (ret <= 0) + for (i = 0; i < tev->nargs; i++) + if (synthesize_probe_trace_arg(&tev->args[i], &buf) < 0) goto error; - len += ret; - } - return buf; + ret = strbuf_detach(&buf, NULL); error: - free(buf); - return NULL; + strbuf_release(&buf); + return ret; } static int find_perf_probe_point_from_map(struct probe_trace_point *tp, @@ -1958,7 +1962,7 @@ static int convert_to_perf_probe_point(struct probe_trace_point *tp, static int convert_to_perf_probe_event(struct probe_trace_event *tev, struct perf_probe_event *pev, bool is_kprobe) { - char buf[64] = ""; + struct strbuf buf = STRBUF_INIT; int i, ret; /* Convert event/group name */ @@ -1981,14 +1985,15 @@ static int convert_to_perf_probe_event(struct probe_trace_event *tev, if (tev->args[i].name) pev->args[i].name = strdup(tev->args[i].name); else { - ret = synthesize_probe_trace_arg(&tev->args[i], - buf, 64); - pev->args[i].name = strdup(buf); + if ((ret = strbuf_init(&buf, 32)) < 0) + goto error; + ret = synthesize_probe_trace_arg(&tev->args[i], &buf); + pev->args[i].name = strbuf_detach(&buf, NULL); } if (pev->args[i].name == NULL && ret >= 0) ret = -ENOMEM; } - +error: if (ret < 0) clear_perf_probe_event(pev); @@ -2162,35 +2167,38 @@ static int perf_probe_event__sprintf(const char *group, const char *event, struct strbuf *result) { int i, ret; - char buf[128]; - char *place; + char *buf; - /* Synthesize only event probe point */ - place = synthesize_perf_probe_point(&pev->point); - if (!place) - return -EINVAL; + if (asprintf(&buf, "%s:%s", group, event) < 0) + return -errno; + ret = strbuf_addf(result, " %-20s (on ", buf); + free(buf); + if (ret) + return ret; - ret = e_snprintf(buf, 128, "%s:%s", group, event); - if (ret < 0) - goto out; + /* Synthesize only event probe point */ + buf = synthesize_perf_probe_point(&pev->point); + if (!buf) + return -ENOMEM; + ret = strbuf_addstr(result, buf); + free(buf); - strbuf_addf(result, " %-20s (on %s", buf, place); - if (module) - strbuf_addf(result, " in %s", module); + if (!ret && module) + ret = strbuf_addf(result, " in %s", module); - if (pev->nargs > 0) { - strbuf_add(result, " with", 5); - for (i = 0; i < pev->nargs; i++) { - ret = synthesize_perf_probe_arg(&pev->args[i], - buf, 128); - if (ret < 0) - goto out; - strbuf_addf(result, " %s", buf); + if (!ret && pev->nargs > 0) { + ret = strbuf_add(result, " with", 5); + for (i = 0; !ret && i < pev->nargs; i++) { + buf = synthesize_perf_probe_arg(&pev->args[i]); + if (!buf) + return -ENOMEM; + ret = strbuf_addf(result, " %s", buf); + free(buf); } } - strbuf_addch(result, ')'); -out: - free(place); + if (!ret) + ret = strbuf_addch(result, ')'); + return ret; } @@ -2498,7 +2506,8 @@ static int find_probe_functions(struct map *map, char *name, void __weak arch__fix_tev_from_maps(struct perf_probe_event *pev __maybe_unused, struct probe_trace_event *tev __maybe_unused, - struct map *map __maybe_unused) { } + struct map *map __maybe_unused, + struct symbol *sym __maybe_unused) { } /* * Find probe function addresses from map. 
@@ -2516,6 +2525,7 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev, struct probe_trace_point *tp; int num_matched_functions; int ret, i, j, skipped = 0; + char *mod_name; map = get_target_map(pev->target, pev->uprobes); if (!map) { @@ -2600,9 +2610,19 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev, tp->realname = strdup_or_goto(sym->name, nomem_out); tp->retprobe = pp->retprobe; - if (pev->target) - tev->point.module = strdup_or_goto(pev->target, - nomem_out); + if (pev->target) { + if (pev->uprobes) { + tev->point.module = strdup_or_goto(pev->target, + nomem_out); + } else { + mod_name = find_module_name(pev->target); + tev->point.module = + strdup(mod_name ? mod_name : pev->target); + free(mod_name); + if (!tev->point.module) + goto nomem_out; + } + } tev->uprobes = pev->uprobes; tev->nargs = pev->nargs; if (tev->nargs) { @@ -2624,7 +2644,7 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev, strdup_or_goto(pev->args[i].type, nomem_out); } - arch__fix_tev_from_maps(pev, tev, map); + arch__fix_tev_from_maps(pev, tev, map, sym); } if (ret == skipped) { ret = -ENOENT; @@ -2743,9 +2763,13 @@ static int convert_to_probe_trace_events(struct perf_probe_event *pev, { int ret; - if (pev->uprobes && !pev->group) { - /* Replace group name if not given */ - ret = convert_exec_to_group(pev->target, &pev->group); + if (!pev->group) { + /* Set group name if not given */ + if (!pev->uprobes) { + pev->group = strdup(PERFPROBE_GROUP); + ret = pev->group ? 0 : -ENOMEM; + } else + ret = convert_exec_to_group(pev->target, &pev->group); if (ret != 0) { pr_warning("Failed to make a group name.\n"); return ret; diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h index e54e7b011577..5a27eb4fad05 100644 --- a/tools/perf/util/probe-event.h +++ b/tools/perf/util/probe-event.h @@ -120,7 +120,7 @@ int parse_probe_trace_command(const char *cmd, struct probe_trace_event *tev); /* Events to command string */ char *synthesize_perf_probe_command(struct perf_probe_event *pev); char *synthesize_probe_trace_command(struct probe_trace_event *tev); -int synthesize_perf_probe_arg(struct perf_probe_arg *pa, char *buf, size_t len); +char *synthesize_perf_probe_arg(struct perf_probe_arg *pa); /* Check the perf_probe_event needs debuginfo */ bool perf_probe_event_need_dwarf(struct perf_probe_event *pev); @@ -154,7 +154,8 @@ int show_available_vars(struct perf_probe_event *pevs, int npevs, int show_available_funcs(const char *module, struct strfilter *filter, bool user); bool arch__prefers_symtab(void); void arch__fix_tev_from_maps(struct perf_probe_event *pev, - struct probe_trace_event *tev, struct map *map); + struct probe_trace_event *tev, struct map *map, + struct symbol *sym); /* If there is no space to write, returns -E2BIG. */ int e_snprintf(char *str, size_t size, const char *format, ...) 
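The probe-event.c hunks above replace file-name parsing with find_module_name(), which reads the module's real name out of the '.gnu.linkonce.this_module' ELF section, so a module loaded from an arbitrary .ko path is still probed under its in-kernel name. A minimal caller sketch (hypothetical, not part of the patch; the path is only an example):

	char *name = find_module_name("/tmp/foo.ko");	/* example path, not from the patch */
	if (name) {
		/* the returned string is strdup()ed by find_module_name(); caller must free it */
		pr_debug("real module name: %s\n", name);
		free(name);
	}
	/* on NULL, fall back to the string the user passed, as the patch itself does */

The same hunks (together with the probe-event.h prototype change above) turn synthesize_perf_probe_arg() from filling a caller-supplied buffer into returning a freshly allocated string built with strbuf. A sketch of the new calling convention, following the pattern the patch uses in probe-finder.c and perf_probe_event__sprintf():

	char *s = synthesize_perf_probe_arg(&pev->args[0]);	/* NULL on allocation/format error */
	if (s == NULL)
		return -ENOMEM;
	pr_debug("probe arg: %s\n", s);
	free(s);	/* the detached strbuf memory is owned by the caller */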
diff --git a/tools/perf/util/probe-file.c b/tools/perf/util/probe-file.c index e3b3b92e4458..3fe6214970e6 100644 --- a/tools/perf/util/probe-file.c +++ b/tools/perf/util/probe-file.c @@ -220,8 +220,7 @@ int probe_file__add_event(int fd, struct probe_trace_event *tev) pr_debug("Writing event: %s\n", buf); if (!probe_event_dry_run) { - ret = write(fd, buf, strlen(buf)); - if (ret <= 0) { + if (write(fd, buf, strlen(buf)) < (int)strlen(buf)) { ret = -errno; pr_warning("Failed to write event: %s\n", strerror_r(errno, sbuf, sizeof(sbuf))); diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index b3bd0fba0237..1259839dbf6d 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -553,7 +553,7 @@ static int convert_variable(Dwarf_Die *vr_die, struct probe_finder *pf) static int find_variable(Dwarf_Die *sc_die, struct probe_finder *pf) { Dwarf_Die vr_die; - char buf[32], *ptr; + char *buf, *ptr; int ret = 0; /* Copy raw parameters */ @@ -563,13 +563,13 @@ static int find_variable(Dwarf_Die *sc_die, struct probe_finder *pf) if (pf->pvar->name) pf->tvar->name = strdup(pf->pvar->name); else { - ret = synthesize_perf_probe_arg(pf->pvar, buf, 32); - if (ret < 0) - return ret; + buf = synthesize_perf_probe_arg(pf->pvar); + if (!buf) + return -ENOMEM; ptr = strchr(buf, ':'); /* Change type separator to _ */ if (ptr) *ptr = '_'; - pf->tvar->name = strdup(buf); + pf->tvar->name = buf; } if (pf->tvar->name == NULL) return -ENOMEM; @@ -1294,6 +1294,7 @@ static int collect_variables_cb(Dwarf_Die *die_mem, void *data) { struct available_var_finder *af = data; struct variable_list *vl; + struct strbuf buf = STRBUF_INIT; int tag, ret; vl = &af->vls[af->nvls - 1]; @@ -1307,25 +1308,26 @@ static int collect_variables_cb(Dwarf_Die *die_mem, void *data) if (ret == 0 || ret == -ERANGE) { int ret2; bool externs = !af->child; - struct strbuf buf; - strbuf_init(&buf, 64); + if (strbuf_init(&buf, 64) < 0) + goto error; if (probe_conf.show_location_range) { - if (!externs) { - if (ret) - strbuf_add(&buf, "[INV]\t", 6); - else - strbuf_add(&buf, "[VAL]\t", 6); - } else - strbuf_add(&buf, "[EXT]\t", 6); + if (!externs) + ret2 = strbuf_add(&buf, + ret ? 
"[INV]\t" : "[VAL]\t", 6); + else + ret2 = strbuf_add(&buf, "[EXT]\t", 6); + if (ret2) + goto error; } ret2 = die_get_varname(die_mem, &buf); if (!ret2 && probe_conf.show_location_range && !externs) { - strbuf_addch(&buf, '\t'); + if (strbuf_addch(&buf, '\t') < 0) + goto error; ret2 = die_get_var_range(&af->pf.sp_die, die_mem, &buf); } @@ -1343,6 +1345,10 @@ static int collect_variables_cb(Dwarf_Die *die_mem, void *data) return DIE_FIND_CB_CONTINUE; else return DIE_FIND_CB_SIBLING; +error: + strbuf_release(&buf); + pr_debug("Error in strbuf\n"); + return DIE_FIND_CB_END; } /* Add a found vars into available variables list */ diff --git a/tools/perf/util/python-ext-sources b/tools/perf/util/python-ext-sources index 8162ba0e2e57..36c6862119e3 100644 --- a/tools/perf/util/python-ext-sources +++ b/tools/perf/util/python-ext-sources @@ -23,3 +23,4 @@ util/strlist.c util/trace-event.c ../lib/rbtree.c util/string.c +util/symbol_fprintf.c diff --git a/tools/perf/util/quote.c b/tools/perf/util/quote.c index 01f03242b86a..c6d4ee2de752 100644 --- a/tools/perf/util/quote.c +++ b/tools/perf/util/quote.c @@ -17,38 +17,42 @@ static inline int need_bs_quote(char c) return (c == '\'' || c == '!'); } -static void sq_quote_buf(struct strbuf *dst, const char *src) +static int sq_quote_buf(struct strbuf *dst, const char *src) { char *to_free = NULL; + int ret; if (dst->buf == src) to_free = strbuf_detach(dst, NULL); - strbuf_addch(dst, '\''); - while (*src) { + ret = strbuf_addch(dst, '\''); + while (!ret && *src) { size_t len = strcspn(src, "'!"); - strbuf_add(dst, src, len); + ret = strbuf_add(dst, src, len); src += len; - while (need_bs_quote(*src)) { - strbuf_addstr(dst, "'\\"); - strbuf_addch(dst, *src++); - strbuf_addch(dst, '\''); - } + while (!ret && need_bs_quote(*src)) + ret = strbuf_addf(dst, "'\\%c\'", *src++); } - strbuf_addch(dst, '\''); + if (!ret) + ret = strbuf_addch(dst, '\''); free(to_free); + + return ret; } -void sq_quote_argv(struct strbuf *dst, const char** argv, size_t maxlen) +int sq_quote_argv(struct strbuf *dst, const char** argv, size_t maxlen) { - int i; + int i, ret; /* Copy into destination buffer. */ - strbuf_grow(dst, 255); - for (i = 0; argv[i]; ++i) { - strbuf_addch(dst, ' '); - sq_quote_buf(dst, argv[i]); + ret = strbuf_grow(dst, 255); + for (i = 0; !ret && argv[i]; ++i) { + ret = strbuf_addch(dst, ' '); + if (ret) + break; + ret = sq_quote_buf(dst, argv[i]); if (maxlen && dst->len > maxlen) die("Too many or long arguments"); } + return ret; } diff --git a/tools/perf/util/quote.h b/tools/perf/util/quote.h index 3340c9c4a6ca..e1ec19146fb0 100644 --- a/tools/perf/util/quote.h +++ b/tools/perf/util/quote.h @@ -24,6 +24,6 @@ * sq_quote() in a real application. */ -void sq_quote_argv(struct strbuf *, const char **argv, size_t maxlen); +int sq_quote_argv(struct strbuf *, const char **argv, size_t maxlen); #endif /* __PERF_QUOTE_H */ diff --git a/tools/perf/util/rb_resort.h b/tools/perf/util/rb_resort.h new file mode 100644 index 000000000000..abc76e3d3098 --- /dev/null +++ b/tools/perf/util/rb_resort.h @@ -0,0 +1,149 @@ +#ifndef _PERF_RESORT_RB_H_ +#define _PERF_RESORT_RB_H_ +/* + * Template for creating a class to resort an existing rb_tree according to + * a new sort criteria, that must be present in the entries of the source + * rb_tree. 
+ * + * (c) 2016 Arnaldo Carvalho de Melo <acme@redhat.com> + * + * Quick example, resorting threads by its shortname: + * + * First define the prefix (threads) to be used for the functions and data + * structures created, and provide an expression for the sorting, then the + * fields to be present in each of the entries in the new, sorted, rb_tree. + * + * The body of the init function should collect the fields, maybe + * pre-calculating them from multiple entries in the original 'entry' from + * the rb_tree used as a source for the entries to be sorted: + +DEFINE_RB_RESORT_RB(threads, strcmp(a->thread->shortname, + b->thread->shortname) < 0, + struct thread *thread; +) +{ + entry->thread = rb_entry(nd, struct thread, rb_node); +} + + * After this it is just a matter of instantiating it and iterating it, + * for a few data structures with existing rb_trees, such as 'struct machine', + * helpers are available to get the rb_root and the nr_entries: + + DECLARE_RESORT_RB_MACHINE_THREADS(threads, machine_ptr); + + * This will instantiate the new rb_tree and a cursor for it, that can be used as: + + struct rb_node *nd; + + resort_rb__for_each(nd, threads) { + struct thread *t = threads_entry; + printf("%s: %d\n", t->shortname, t->tid); + } + + * Then delete it: + + resort_rb__delete(threads); + + * The name of the data structures and functions will have a _sorted suffix + * right before the method names, i.e. will look like: + * + * struct threads_sorted_entry {} + * threads_sorted__insert() + */ + +#define DEFINE_RESORT_RB(__name, __comp, ...) \ +struct __name##_sorted_entry { \ + struct rb_node rb_node; \ + __VA_ARGS__ \ +}; \ +static void __name##_sorted__init_entry(struct rb_node *nd, \ + struct __name##_sorted_entry *entry); \ + \ +static int __name##_sorted__cmp(struct rb_node *nda, struct rb_node *ndb) \ +{ \ + struct __name##_sorted_entry *a, *b; \ + a = rb_entry(nda, struct __name##_sorted_entry, rb_node); \ + b = rb_entry(ndb, struct __name##_sorted_entry, rb_node); \ + return __comp; \ +} \ + \ +struct __name##_sorted { \ + struct rb_root entries; \ + struct __name##_sorted_entry nd[0]; \ +}; \ + \ +static void __name##_sorted__insert(struct __name##_sorted *sorted, \ + struct rb_node *sorted_nd) \ +{ \ + struct rb_node **p = &sorted->entries.rb_node, *parent = NULL; \ + while (*p != NULL) { \ + parent = *p; \ + if (__name##_sorted__cmp(sorted_nd, parent)) \ + p = &(*p)->rb_left; \ + else \ + p = &(*p)->rb_right; \ + } \ + rb_link_node(sorted_nd, parent, p); \ + rb_insert_color(sorted_nd, &sorted->entries); \ +} \ + \ +static void __name##_sorted__sort(struct __name##_sorted *sorted, \ + struct rb_root *entries) \ +{ \ + struct rb_node *nd; \ + unsigned int i = 0; \ + for (nd = rb_first(entries); nd; nd = rb_next(nd)) { \ + struct __name##_sorted_entry *snd = &sorted->nd[i++]; \ + __name##_sorted__init_entry(nd, snd); \ + __name##_sorted__insert(sorted, &snd->rb_node); \ + } \ +} \ + \ +static struct __name##_sorted *__name##_sorted__new(struct rb_root *entries, \ + int nr_entries) \ +{ \ + struct __name##_sorted *sorted; \ + sorted = malloc(sizeof(*sorted) + sizeof(sorted->nd[0]) * nr_entries); \ + if (sorted) { \ + sorted->entries = RB_ROOT; \ + __name##_sorted__sort(sorted, entries); \ + } \ + return sorted; \ +} \ + \ +static void __name##_sorted__delete(struct __name##_sorted *sorted) \ +{ \ + free(sorted); \ +} \ + \ +static void __name##_sorted__init_entry(struct rb_node *nd, \ + struct __name##_sorted_entry *entry) + +#define DECLARE_RESORT_RB(__name) \ +struct 
__name##_sorted_entry *__name##_entry; \ +struct __name##_sorted *__name = __name##_sorted__new + +#define resort_rb__for_each(__nd, __name) \ + for (__nd = rb_first(&__name->entries); \ + __name##_entry = rb_entry(__nd, struct __name##_sorted_entry, \ + rb_node), __nd; \ + __nd = rb_next(__nd)) + +#define resort_rb__delete(__name) \ + __name##_sorted__delete(__name), __name = NULL + +/* + * Helpers for other classes that contains both an rbtree and the + * number of entries in it: + */ + +/* For 'struct intlist' */ +#define DECLARE_RESORT_RB_INTLIST(__name, __ilist) \ + DECLARE_RESORT_RB(__name)(&__ilist->rblist.entries, \ + __ilist->rblist.nr_entries) + +/* For 'struct machine->threads' */ +#define DECLARE_RESORT_RB_MACHINE_THREADS(__name, __machine) \ + DECLARE_RESORT_RB(__name)(&__machine->threads, __machine->nr_threads) + +#endif /* _PERF_RESORT_RB_H_ */ diff --git a/tools/perf/util/record.c b/tools/perf/util/record.c index 0467367dc315..481792c7484b 100644 --- a/tools/perf/util/record.c +++ b/tools/perf/util/record.c @@ -129,7 +129,8 @@ bool perf_can_record_cpu_wide(void) return true; } -void perf_evlist__config(struct perf_evlist *evlist, struct record_opts *opts) +void perf_evlist__config(struct perf_evlist *evlist, struct record_opts *opts, + struct callchain_param *callchain) { struct perf_evsel *evsel; bool use_sample_identifier = false; @@ -148,7 +149,7 @@ void perf_evlist__config(struct perf_evlist *evlist, struct record_opts *opts) use_comm_exec = perf_can_comm_exec(); evlist__for_each(evlist, evsel) { - perf_evsel__config(evsel, opts); + perf_evsel__config(evsel, opts, callchain); if (evsel->tracking && use_comm_exec) evsel->attr.comm_exec = 1; } diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c index b3aabc0d4eb0..62c7f6988e0e 100644 --- a/tools/perf/util/scripting-engines/trace-event-perl.c +++ b/tools/perf/util/scripting-engines/trace-event-perl.c @@ -31,6 +31,8 @@ #include <perl.h> #include "../../perf.h" +#include "../callchain.h" +#include "../machine.h" #include "../thread.h" #include "../event.h" #include "../trace-event.h" @@ -248,10 +250,90 @@ static void define_event_symbols(struct event_format *event, define_event_symbols(event, ev_name, args->next); } +static SV *perl_process_callchain(struct perf_sample *sample, + struct perf_evsel *evsel, + struct addr_location *al) +{ + AV *list; + + list = newAV(); + if (!list) + goto exit; + + if (!symbol_conf.use_callchain || !sample->callchain) + goto exit; + + if (thread__resolve_callchain(al->thread, &callchain_cursor, evsel, + sample, NULL, NULL, + sysctl_perf_event_max_stack) != 0) { + pr_err("Failed to resolve callchain. 
Skipping\n"); + goto exit; + } + callchain_cursor_commit(&callchain_cursor); + + + while (1) { + HV *elem; + struct callchain_cursor_node *node; + node = callchain_cursor_current(&callchain_cursor); + if (!node) + break; + + elem = newHV(); + if (!elem) + goto exit; + + if (!hv_stores(elem, "ip", newSVuv(node->ip))) { + hv_undef(elem); + goto exit; + } + + if (node->sym) { + HV *sym = newHV(); + if (!sym) { + hv_undef(elem); + goto exit; + } + if (!hv_stores(sym, "start", newSVuv(node->sym->start)) || + !hv_stores(sym, "end", newSVuv(node->sym->end)) || + !hv_stores(sym, "binding", newSVuv(node->sym->binding)) || + !hv_stores(sym, "name", newSVpvn(node->sym->name, + node->sym->namelen)) || + !hv_stores(elem, "sym", newRV_noinc((SV*)sym))) { + hv_undef(sym); + hv_undef(elem); + goto exit; + } + } + + if (node->map) { + struct map *map = node->map; + const char *dsoname = "[unknown]"; + if (map && map->dso && (map->dso->name || map->dso->long_name)) { + if (symbol_conf.show_kernel_path && map->dso->long_name) + dsoname = map->dso->long_name; + else if (map->dso->name) + dsoname = map->dso->name; + } + if (!hv_stores(elem, "dso", newSVpv(dsoname,0))) { + hv_undef(elem); + goto exit; + } + } + + callchain_cursor_advance(&callchain_cursor); + av_push(list, newRV_noinc((SV*)elem)); + } + +exit: + return newRV_noinc((SV*)list); +} + static void perl_process_tracepoint(struct perf_sample *sample, struct perf_evsel *evsel, - struct thread *thread) + struct addr_location *al) { + struct thread *thread = al->thread; struct event_format *event = evsel->tp_format; struct format_field *field; static char handler[256]; @@ -295,6 +377,7 @@ static void perl_process_tracepoint(struct perf_sample *sample, XPUSHs(sv_2mortal(newSVuv(ns))); XPUSHs(sv_2mortal(newSViv(pid))); XPUSHs(sv_2mortal(newSVpv(comm, 0))); + XPUSHs(sv_2mortal(perl_process_callchain(sample, evsel, al))); /* common fields other than pid can be accessed via xsub fns */ @@ -329,6 +412,7 @@ static void perl_process_tracepoint(struct perf_sample *sample, XPUSHs(sv_2mortal(newSVuv(nsecs))); XPUSHs(sv_2mortal(newSViv(pid))); XPUSHs(sv_2mortal(newSVpv(comm, 0))); + XPUSHs(sv_2mortal(perl_process_callchain(sample, evsel, al))); call_pv("main::trace_unhandled", G_SCALAR); } SPAGAIN; @@ -366,7 +450,7 @@ static void perl_process_event(union perf_event *event, struct perf_evsel *evsel, struct addr_location *al) { - perl_process_tracepoint(sample, evsel, al->thread); + perl_process_tracepoint(sample, evsel, al); perl_process_event_generic(event, sample, evsel); } @@ -490,7 +574,27 @@ static int perl_generate_script(struct pevent *pevent, const char *outfile) fprintf(ofp, "use Perf::Trace::Util;\n\n"); fprintf(ofp, "sub trace_begin\n{\n\t# optional\n}\n\n"); - fprintf(ofp, "sub trace_end\n{\n\t# optional\n}\n\n"); + fprintf(ofp, "sub trace_end\n{\n\t# optional\n}\n"); + + + fprintf(ofp, "\n\ +sub print_backtrace\n\ +{\n\ + my $callchain = shift;\n\ + for my $node (@$callchain)\n\ + {\n\ + if(exists $node->{sym})\n\ + {\n\ + printf( \"\\t[\\%%x] \\%%s\\n\", $node->{ip}, $node->{sym}{name});\n\ + }\n\ + else\n\ + {\n\ + printf( \"\\t[\\%%x]\\n\", $node{ip});\n\ + }\n\ + }\n\ +}\n\n\ +"); + while ((event = trace_find_next_event(pevent, event))) { fprintf(ofp, "sub %s::%s\n{\n", event->system, event->name); @@ -502,7 +606,8 @@ static int perl_generate_script(struct pevent *pevent, const char *outfile) fprintf(ofp, "$common_secs, "); fprintf(ofp, "$common_nsecs,\n"); fprintf(ofp, "\t $common_pid, "); - fprintf(ofp, "$common_comm,\n\t "); + fprintf(ofp, 
"$common_comm, "); + fprintf(ofp, "$common_callchain,\n\t "); not_first = 0; count = 0; @@ -519,7 +624,7 @@ static int perl_generate_script(struct pevent *pevent, const char *outfile) fprintf(ofp, "\tprint_header($event_name, $common_cpu, " "$common_secs, $common_nsecs,\n\t " - "$common_pid, $common_comm);\n\n"); + "$common_pid, $common_comm, $common_callchain);\n\n"); fprintf(ofp, "\tprintf(\""); @@ -581,17 +686,22 @@ static int perl_generate_script(struct pevent *pevent, const char *outfile) fprintf(ofp, "$%s", f->name); } - fprintf(ofp, ");\n"); + fprintf(ofp, ");\n\n"); + + fprintf(ofp, "\tprint_backtrace($common_callchain);\n"); + fprintf(ofp, "}\n\n"); } fprintf(ofp, "sub trace_unhandled\n{\n\tmy ($event_name, $context, " "$common_cpu, $common_secs, $common_nsecs,\n\t " - "$common_pid, $common_comm) = @_;\n\n"); + "$common_pid, $common_comm, $common_callchain) = @_;\n\n"); fprintf(ofp, "\tprint_header($event_name, $common_cpu, " "$common_secs, $common_nsecs,\n\t $common_pid, " - "$common_comm);\n}\n\n"); + "$common_comm, $common_callchain);\n"); + fprintf(ofp, "\tprint_backtrace($common_callchain);\n"); + fprintf(ofp, "}\n\n"); fprintf(ofp, "sub print_header\n{\n" "\tmy ($event_name, $cpu, $secs, $nsecs, $pid, $comm) = @_;\n\n" diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index fbd05242b4e5..ff134700bf30 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -41,6 +41,7 @@ #include "../thread-stack.h" #include "../trace-event.h" #include "../machine.h" +#include "../call-path.h" #include "thread_map.h" #include "cpumap.h" #include "stat.h" @@ -323,7 +324,7 @@ static PyObject *python_process_callchain(struct perf_sample *sample, if (!symbol_conf.use_callchain || !sample->callchain) goto exit; - if (thread__resolve_callchain(al->thread, evsel, + if (thread__resolve_callchain(al->thread, &callchain_cursor, evsel, sample, NULL, NULL, scripting_max_stack) != 0) { pr_err("Failed to resolve callchain. Skipping\n"); @@ -407,8 +408,11 @@ static void python_process_tracepoint(struct perf_sample *sample, if (!t) Py_FatalError("couldn't create Python tuple"); - if (!event) - die("ug! no event found for type %d", (int)evsel->attr.config); + if (!event) { + snprintf(handler_name, sizeof(handler_name), + "ug! 
no event found for type %" PRIu64, (u64)evsel->attr.config); + Py_FatalError(handler_name); + } pid = raw_field_value(event, "common_pid", data); @@ -614,7 +618,7 @@ static int python_export_dso(struct db_export *dbe, struct dso *dso, struct machine *machine) { struct tables *tables = container_of(dbe, struct tables, dbe); - char sbuild_id[BUILD_ID_SIZE * 2 + 1]; + char sbuild_id[SBUILD_ID_SIZE]; PyObject *t; build_id__sprintf(dso->build_id, sizeof(dso->build_id), sbuild_id); @@ -681,7 +685,7 @@ static int python_export_sample(struct db_export *dbe, struct tables *tables = container_of(dbe, struct tables, dbe); PyObject *t; - t = tuple_new(21); + t = tuple_new(22); tuple_set_u64(t, 0, es->db_id); tuple_set_u64(t, 1, es->evsel->db_id); @@ -704,6 +708,7 @@ static int python_export_sample(struct db_export *dbe, tuple_set_u64(t, 18, es->sample->data_src); tuple_set_s32(t, 19, es->sample->flags & PERF_BRANCH_MASK); tuple_set_s32(t, 20, !!(es->sample->flags & PERF_IP_FLAG_IN_TX)); + tuple_set_u64(t, 21, es->call_path_id); call_object(tables->sample_handler, t, "sample_table"); @@ -998,8 +1003,10 @@ static void set_table_handlers(struct tables *tables) { const char *perf_db_export_mode = "perf_db_export_mode"; const char *perf_db_export_calls = "perf_db_export_calls"; - PyObject *db_export_mode, *db_export_calls; + const char *perf_db_export_callchains = "perf_db_export_callchains"; + PyObject *db_export_mode, *db_export_calls, *db_export_callchains; bool export_calls = false; + bool export_callchains = false; int ret; memset(tables, 0, sizeof(struct tables)); @@ -1016,6 +1023,7 @@ static void set_table_handlers(struct tables *tables) if (!ret) return; + /* handle export calls */ tables->dbe.crp = NULL; db_export_calls = PyDict_GetItemString(main_dict, perf_db_export_calls); if (db_export_calls) { @@ -1033,6 +1041,33 @@ static void set_table_handlers(struct tables *tables) Py_FatalError("failed to create calls processor"); } + /* handle export callchains */ + tables->dbe.cpr = NULL; + db_export_callchains = PyDict_GetItemString(main_dict, + perf_db_export_callchains); + if (db_export_callchains) { + ret = PyObject_IsTrue(db_export_callchains); + if (ret == -1) + handler_call_die(perf_db_export_callchains); + export_callchains = !!ret; + } + + if (export_callchains) { + /* + * Attempt to use the call path root from the call return + * processor, if the call return processor is in use. Otherwise, + * we allocate a new call path root. This prevents exporting + * duplicate call path ids when both are in use simultaniously. 
+ */ + if (tables->dbe.crp) + tables->dbe.cpr = tables->dbe.crp->cpr; + else + tables->dbe.cpr = call_path_root__new(); + + if (!tables->dbe.cpr) + Py_FatalError("failed to create call path root"); + } + tables->db_export_mode = true; /* * Reserve per symbol space for symbol->db_id via symbol__priv() diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 4abd85c6346d..2335b2824d8a 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -409,6 +409,8 @@ void perf_tool__fill_defaults(struct perf_tool *tool) tool->stat = process_stat_stub; if (tool->stat_round == NULL) tool->stat_round = process_stat_round_stub; + if (tool->time_conv == NULL) + tool->time_conv = process_event_op2_stub; } static void swap_sample_id_all(union perf_event *event, void *data) @@ -794,6 +796,7 @@ static perf_event__swap_op perf_event__swap_ops[] = { [PERF_RECORD_STAT] = perf_event__stat_swap, [PERF_RECORD_STAT_ROUND] = perf_event__stat_round_swap, [PERF_RECORD_EVENT_UPDATE] = perf_event__event_update_swap, + [PERF_RECORD_TIME_CONV] = perf_event__all64_swap, [PERF_RECORD_HEADER_MAX] = NULL, }; @@ -904,7 +907,7 @@ static void callchain__printf(struct perf_evsel *evsel, unsigned int i; struct ip_callchain *callchain = sample->callchain; - if (has_branch_callstack(evsel)) + if (perf_evsel__has_branch_callstack(evsel)) callchain__lbr_callstack_printf(sample); printf("... FP chain: nr:%" PRIu64 "\n", callchain->nr); @@ -1078,7 +1081,7 @@ static void dump_sample(struct perf_evsel *evsel, union perf_event *event, if (sample_type & PERF_SAMPLE_CALLCHAIN) callchain__printf(evsel, sample); - if ((sample_type & PERF_SAMPLE_BRANCH_STACK) && !has_branch_callstack(evsel)) + if ((sample_type & PERF_SAMPLE_BRANCH_STACK) && !perf_evsel__has_branch_callstack(evsel)) branch_stack__printf(sample); if (sample_type & PERF_SAMPLE_REGS_USER) @@ -1341,6 +1344,9 @@ static s64 perf_session__process_user_event(struct perf_session *session, return tool->stat(tool, event, session); case PERF_RECORD_STAT_ROUND: return tool->stat_round(tool, event, session); + case PERF_RECORD_TIME_CONV: + session->time_conv = event->time_conv; + return tool->time_conv(tool, event, session); default: return -EINVAL; } @@ -1830,7 +1836,11 @@ out: out_err: ui_progress__finish(); perf_session__warn_about_errors(session); - ordered_events__free(&session->ordered_events); + /* + * We may switching perf.data output, make ordered_events + * reusable. + */ + ordered_events__reinit(&session->ordered_events); auxtrace__free_events(session); session->one_mmap = false; return err; @@ -1947,105 +1957,6 @@ struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session, return NULL; } -void perf_evsel__print_ip(struct perf_evsel *evsel, struct perf_sample *sample, - struct addr_location *al, - unsigned int print_opts, unsigned int stack_depth) -{ - struct callchain_cursor_node *node; - int print_ip = print_opts & PRINT_IP_OPT_IP; - int print_sym = print_opts & PRINT_IP_OPT_SYM; - int print_dso = print_opts & PRINT_IP_OPT_DSO; - int print_symoffset = print_opts & PRINT_IP_OPT_SYMOFFSET; - int print_oneline = print_opts & PRINT_IP_OPT_ONELINE; - int print_srcline = print_opts & PRINT_IP_OPT_SRCLINE; - char s = print_oneline ? ' ' : '\t'; - - if (symbol_conf.use_callchain && sample->callchain) { - struct addr_location node_al; - - if (thread__resolve_callchain(al->thread, evsel, - sample, NULL, NULL, - stack_depth) != 0) { - if (verbose) - error("Failed to resolve callchain. 
Skipping\n"); - return; - } - callchain_cursor_commit(&callchain_cursor); - - if (print_symoffset) - node_al = *al; - - while (stack_depth) { - u64 addr = 0; - - node = callchain_cursor_current(&callchain_cursor); - if (!node) - break; - - if (node->sym && node->sym->ignore) - goto next; - - if (print_ip) - printf("%c%16" PRIx64, s, node->ip); - - if (node->map) - addr = node->map->map_ip(node->map, node->ip); - - if (print_sym) { - printf(" "); - if (print_symoffset) { - node_al.addr = addr; - node_al.map = node->map; - symbol__fprintf_symname_offs(node->sym, &node_al, stdout); - } else - symbol__fprintf_symname(node->sym, stdout); - } - - if (print_dso) { - printf(" ("); - map__fprintf_dsoname(node->map, stdout); - printf(")"); - } - - if (print_srcline) - map__fprintf_srcline(node->map, addr, "\n ", - stdout); - - if (!print_oneline) - printf("\n"); - - stack_depth--; -next: - callchain_cursor_advance(&callchain_cursor); - } - - } else { - if (al->sym && al->sym->ignore) - return; - - if (print_ip) - printf("%16" PRIx64, sample->ip); - - if (print_sym) { - printf(" "); - if (print_symoffset) - symbol__fprintf_symname_offs(al->sym, al, - stdout); - else - symbol__fprintf_symname(al->sym, stdout); - } - - if (print_dso) { - printf(" ("); - map__fprintf_dsoname(al->map, stdout); - printf(")"); - } - - if (print_srcline) - map__fprintf_srcline(al->map, al->addr, "\n ", stdout); - } -} - int perf_session__cpu_bitmap(struct perf_session *session, const char *cpu_list, unsigned long *cpu_bitmap) { diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index 5f792e35d4c1..4bd758553450 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -26,6 +26,7 @@ struct perf_session { struct itrace_synth_opts *itrace_synth_opts; struct list_head auxtrace_index; struct trace_event tevent; + struct time_conv_event time_conv; bool repipe; bool one_mmap; void *one_mmap_addr; @@ -35,13 +36,6 @@ struct perf_session { struct perf_tool *tool; }; -#define PRINT_IP_OPT_IP (1<<0) -#define PRINT_IP_OPT_SYM (1<<1) -#define PRINT_IP_OPT_DSO (1<<2) -#define PRINT_IP_OPT_SYMOFFSET (1<<3) -#define PRINT_IP_OPT_ONELINE (1<<4) -#define PRINT_IP_OPT_SRCLINE (1<<5) - struct perf_tool; struct perf_session *perf_session__new(struct perf_data_file *file, @@ -103,10 +97,6 @@ size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp); struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session, unsigned int type); -void perf_evsel__print_ip(struct perf_evsel *evsel, struct perf_sample *sample, - struct addr_location *al, - unsigned int print_opts, unsigned int stack_depth); - int perf_session__cpu_bitmap(struct perf_session *session, const char *cpu_list, unsigned long *cpu_bitmap); diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 47966a1618c7..20e69edd5006 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -21,13 +21,6 @@ const char *sort_order; const char *field_order; regex_t ignore_callees_regex; int have_ignore_callees = 0; -int sort__need_collapse = 0; -int sort__has_parent = 0; -int sort__has_sym = 0; -int sort__has_dso = 0; -int sort__has_socket = 0; -int sort__has_thread = 0; -int sort__has_comm = 0; enum sort_mode sort__mode = SORT_MODE__NORMAL; /* @@ -244,7 +237,7 @@ sort__sym_cmp(struct hist_entry *left, struct hist_entry *right) * comparing symbol address alone is not enough since it's a * relative address within a dso. 
*/ - if (!sort__has_dso) { + if (!hists__has(left->hists, dso) || hists__has(right->hists, dso)) { ret = sort__dso_cmp(left, right); if (ret != 0) return ret; @@ -2163,7 +2156,7 @@ static int __sort_dimension__add(struct sort_dimension *sd, return -1; if (sd->entry->se_collapse) - sort__need_collapse = 1; + list->need_collapse = 1; sd->taken = 1; @@ -2245,9 +2238,9 @@ static int sort_dimension__add(struct perf_hpp_list *list, const char *tok, pr_err("Invalid regex: %s\n%s", parent_pattern, err); return -EINVAL; } - sort__has_parent = 1; + list->parent = 1; } else if (sd->entry == &sort_sym) { - sort__has_sym = 1; + list->sym = 1; /* * perf diff displays the performance difference amongst * two or more perf.data files. Those files could come @@ -2258,13 +2251,13 @@ static int sort_dimension__add(struct perf_hpp_list *list, const char *tok, sd->entry->se_collapse = sort__sym_sort; } else if (sd->entry == &sort_dso) { - sort__has_dso = 1; + list->dso = 1; } else if (sd->entry == &sort_socket) { - sort__has_socket = 1; + list->socket = 1; } else if (sd->entry == &sort_thread) { - sort__has_thread = 1; + list->thread = 1; } else if (sd->entry == &sort_comm) { - sort__has_comm = 1; + list->comm = 1; } return __sort_dimension__add(sd, list, level); @@ -2289,7 +2282,7 @@ static int sort_dimension__add(struct perf_hpp_list *list, const char *tok, return -EINVAL; if (sd->entry == &sort_sym_from || sd->entry == &sort_sym_to) - sort__has_sym = 1; + list->sym = 1; __sort_dimension__add(sd, list, level); return 0; @@ -2305,7 +2298,7 @@ static int sort_dimension__add(struct perf_hpp_list *list, const char *tok, return -EINVAL; if (sd->entry == &sort_mem_daddr_sym) - sort__has_sym = 1; + list->sym = 1; __sort_dimension__add(sd, list, level); return 0; @@ -2445,6 +2438,9 @@ static char *prefix_if_not_in(const char *pre, char *str) static char *setup_overhead(char *keys) { + if (sort__mode == SORT_MODE__DIFF) + return keys; + keys = prefix_if_not_in("overhead", keys); if (symbol_conf.cumulate_callchain) @@ -2746,10 +2742,10 @@ int setup_sorting(struct perf_evlist *evlist) void reset_output_field(void) { - sort__need_collapse = 0; - sort__has_parent = 0; - sort__has_sym = 0; - sort__has_dso = 0; + perf_hpp_list.need_collapse = 0; + perf_hpp_list.parent = 0; + perf_hpp_list.sym = 0; + perf_hpp_list.dso = 0; field_order = NULL; sort_order = NULL; diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index 3f4e35998119..42927f448bcb 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -31,13 +31,6 @@ extern const char *parent_pattern; extern const char default_sort_order[]; extern regex_t ignore_callees_regex; extern int have_ignore_callees; -extern int sort__need_collapse; -extern int sort__has_dso; -extern int sort__has_parent; -extern int sort__has_sym; -extern int sort__has_socket; -extern int sort__has_thread; -extern int sort__has_comm; extern enum sort_mode sort__mode; extern struct sort_entry sort_comm; extern struct sort_entry sort_dso; diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index 4d9b481cf3b6..ffa1d0653861 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -307,6 +307,7 @@ int perf_stat_process_counter(struct perf_stat_config *config, struct perf_counts_values *aggr = &counter->counts->aggr; struct perf_stat_evsel *ps = counter->priv; u64 *count = counter->counts->aggr.values; + u64 val; int i, ret; aggr->val = aggr->ena = aggr->run = 0; @@ -346,7 +347,8 @@ int perf_stat_process_counter(struct perf_stat_config *config, /* * Save the full 
runtime - to allow normalization during printout: */ - perf_stat__update_shadow_stats(counter, count, 0); + val = counter->scale * *count; + perf_stat__update_shadow_stats(counter, &val, 0); return 0; } diff --git a/tools/perf/util/strbuf.c b/tools/perf/util/strbuf.c index 8fb73295ec34..f95f682aa2b2 100644 --- a/tools/perf/util/strbuf.c +++ b/tools/perf/util/strbuf.c @@ -1,3 +1,4 @@ +#include "debug.h" #include "cache.h" #include <linux/kernel.h> @@ -17,12 +18,13 @@ int prefixcmp(const char *str, const char *prefix) */ char strbuf_slopbuf[1]; -void strbuf_init(struct strbuf *sb, ssize_t hint) +int strbuf_init(struct strbuf *sb, ssize_t hint) { sb->alloc = sb->len = 0; sb->buf = strbuf_slopbuf; if (hint) - strbuf_grow(sb, hint); + return strbuf_grow(sb, hint); + return 0; } void strbuf_release(struct strbuf *sb) @@ -42,67 +44,104 @@ char *strbuf_detach(struct strbuf *sb, size_t *sz) return res; } -void strbuf_grow(struct strbuf *sb, size_t extra) +int strbuf_grow(struct strbuf *sb, size_t extra) { - if (sb->len + extra + 1 <= sb->len) - die("you want to use way too much memory"); - if (!sb->alloc) - sb->buf = NULL; - ALLOC_GROW(sb->buf, sb->len + extra + 1, sb->alloc); + char *buf; + size_t nr = sb->len + extra + 1; + + if (nr < sb->alloc) + return 0; + + if (nr <= sb->len) + return -E2BIG; + + if (alloc_nr(sb->alloc) > nr) + nr = alloc_nr(sb->alloc); + + /* + * Note that sb->buf == strbuf_slopbuf if sb->alloc == 0, and it is + * a static variable. Thus we have to avoid passing it to realloc. + */ + buf = realloc(sb->alloc ? sb->buf : NULL, nr * sizeof(*buf)); + if (!buf) + return -ENOMEM; + + sb->buf = buf; + sb->alloc = nr; + return 0; } -void strbuf_addch(struct strbuf *sb, int c) +int strbuf_addch(struct strbuf *sb, int c) { - strbuf_grow(sb, 1); + int ret = strbuf_grow(sb, 1); + if (ret) + return ret; + sb->buf[sb->len++] = c; sb->buf[sb->len] = '\0'; + return 0; } -void strbuf_add(struct strbuf *sb, const void *data, size_t len) +int strbuf_add(struct strbuf *sb, const void *data, size_t len) { - strbuf_grow(sb, len); + int ret = strbuf_grow(sb, len); + if (ret) + return ret; + memcpy(sb->buf + sb->len, data, len); - strbuf_setlen(sb, sb->len + len); + return strbuf_setlen(sb, sb->len + len); } -static void strbuf_addv(struct strbuf *sb, const char *fmt, va_list ap) +static int strbuf_addv(struct strbuf *sb, const char *fmt, va_list ap) { - int len; + int len, ret; va_list ap_saved; - if (!strbuf_avail(sb)) - strbuf_grow(sb, 64); + if (!strbuf_avail(sb)) { + ret = strbuf_grow(sb, 64); + if (ret) + return ret; + } va_copy(ap_saved, ap); len = vsnprintf(sb->buf + sb->len, sb->alloc - sb->len, fmt, ap); if (len < 0) - die("your vsnprintf is broken"); + return len; if (len > strbuf_avail(sb)) { - strbuf_grow(sb, len); + ret = strbuf_grow(sb, len); + if (ret) + return ret; len = vsnprintf(sb->buf + sb->len, sb->alloc - sb->len, fmt, ap_saved); va_end(ap_saved); if (len > strbuf_avail(sb)) { - die("this should not happen, your vsnprintf is broken"); + pr_debug("this should not happen, your vsnprintf is broken"); + return -EINVAL; } } - strbuf_setlen(sb, sb->len + len); + return strbuf_setlen(sb, sb->len + len); } -void strbuf_addf(struct strbuf *sb, const char *fmt, ...) +int strbuf_addf(struct strbuf *sb, const char *fmt, ...) 
{ va_list ap; + int ret; va_start(ap, fmt); - strbuf_addv(sb, fmt, ap); + ret = strbuf_addv(sb, fmt, ap); va_end(ap); + return ret; } ssize_t strbuf_read(struct strbuf *sb, int fd, ssize_t hint) { size_t oldlen = sb->len; size_t oldalloc = sb->alloc; + int ret; + + ret = strbuf_grow(sb, hint ? hint : 8192); + if (ret) + return ret; - strbuf_grow(sb, hint ? hint : 8192); for (;;) { ssize_t cnt; @@ -112,12 +151,14 @@ ssize_t strbuf_read(struct strbuf *sb, int fd, ssize_t hint) strbuf_release(sb); else strbuf_setlen(sb, oldlen); - return -1; + return cnt; } if (!cnt) break; sb->len += cnt; - strbuf_grow(sb, 8192); + ret = strbuf_grow(sb, 8192); + if (ret) + return ret; } sb->buf[sb->len] = '\0'; diff --git a/tools/perf/util/strbuf.h b/tools/perf/util/strbuf.h index ab9be0fbbd40..54b409297d4a 100644 --- a/tools/perf/util/strbuf.h +++ b/tools/perf/util/strbuf.h @@ -51,7 +51,7 @@ struct strbuf { #define STRBUF_INIT { 0, 0, strbuf_slopbuf } /*----- strbuf life cycle -----*/ -void strbuf_init(struct strbuf *buf, ssize_t hint); +int strbuf_init(struct strbuf *buf, ssize_t hint); void strbuf_release(struct strbuf *buf); char *strbuf_detach(struct strbuf *buf, size_t *); @@ -60,26 +60,31 @@ static inline ssize_t strbuf_avail(const struct strbuf *sb) { return sb->alloc ? sb->alloc - sb->len - 1 : 0; } -void strbuf_grow(struct strbuf *buf, size_t); +int strbuf_grow(struct strbuf *buf, size_t); -static inline void strbuf_setlen(struct strbuf *sb, size_t len) { - if (!sb->alloc) - strbuf_grow(sb, 0); +static inline int strbuf_setlen(struct strbuf *sb, size_t len) { + int ret; + if (!sb->alloc) { + ret = strbuf_grow(sb, 0); + if (ret) + return ret; + } assert(len < sb->alloc); sb->len = len; sb->buf[len] = '\0'; + return 0; } /*----- add data in your buffer -----*/ -void strbuf_addch(struct strbuf *sb, int c); +int strbuf_addch(struct strbuf *sb, int c); -void strbuf_add(struct strbuf *buf, const void *, size_t); -static inline void strbuf_addstr(struct strbuf *sb, const char *s) { - strbuf_add(sb, s, strlen(s)); +int strbuf_add(struct strbuf *buf, const void *, size_t); +static inline int strbuf_addstr(struct strbuf *sb, const char *s) { + return strbuf_add(sb, s, strlen(s)); } __attribute__((format(printf,2,3))) -void strbuf_addf(struct strbuf *sb, const char *fmt, ...); +int strbuf_addf(struct strbuf *sb, const char *fmt, ...); /* XXX: if read fails, any partial read is undone */ ssize_t strbuf_read(struct strbuf *, int fd, ssize_t hint); diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index bc229a74c6a9..87a297dd8901 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -709,17 +709,10 @@ int symsrc__init(struct symsrc *ss, struct dso *dso, const char *name, if (ss->opdshdr.sh_type != SHT_PROGBITS) ss->opdsec = NULL; - if (dso->kernel == DSO_TYPE_USER) { - GElf_Shdr shdr; - ss->adjust_symbols = (ehdr.e_type == ET_EXEC || - ehdr.e_type == ET_REL || - dso__is_vdso(dso) || - elf_section_by_name(elf, &ehdr, &shdr, - ".gnu.prelink_undo", - NULL) != NULL); - } else { + if (dso->kernel == DSO_TYPE_USER) + ss->adjust_symbols = true; + else ss->adjust_symbols = elf__needs_adjust_symbols(ehdr); - } ss->name = strdup(name); if (!ss->name) { @@ -777,7 +770,8 @@ static bool want_demangle(bool is_kernel_sym) return is_kernel_sym ? 
symbol_conf.demangle_kernel : symbol_conf.demangle; } -void __weak arch__elf_sym_adjust(GElf_Sym *sym __maybe_unused) { } +void __weak arch__sym_update(struct symbol *s __maybe_unused, + GElf_Sym *sym __maybe_unused) { } int dso__load_sym(struct dso *dso, struct map *map, struct symsrc *syms_ss, struct symsrc *runtime_ss, @@ -954,8 +948,6 @@ int dso__load_sym(struct dso *dso, struct map *map, (sym.st_value & 1)) --sym.st_value; - arch__elf_sym_adjust(&sym); - if (dso->kernel || kmodule) { char dso_name[PATH_MAX]; @@ -1089,6 +1081,8 @@ new_symbol: if (!f) goto out_elf_end; + arch__sym_update(f, &sym); + if (filter && filter(curr_map, f)) symbol__delete(f); else { diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index e7588dc91518..7fb33304fb4e 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -255,40 +255,6 @@ void symbol__delete(struct symbol *sym) free(((void *)sym) - symbol_conf.priv_size); } -size_t symbol__fprintf(struct symbol *sym, FILE *fp) -{ - return fprintf(fp, " %" PRIx64 "-%" PRIx64 " %c %s\n", - sym->start, sym->end, - sym->binding == STB_GLOBAL ? 'g' : - sym->binding == STB_LOCAL ? 'l' : 'w', - sym->name); -} - -size_t symbol__fprintf_symname_offs(const struct symbol *sym, - const struct addr_location *al, FILE *fp) -{ - unsigned long offset; - size_t length; - - if (sym && sym->name) { - length = fprintf(fp, "%s", sym->name); - if (al) { - if (al->addr < sym->end) - offset = al->addr - sym->start; - else - offset = al->addr - al->map->start - sym->start; - length += fprintf(fp, "+0x%lx", offset); - } - return length; - } else - return fprintf(fp, "[unknown]"); -} - -size_t symbol__fprintf_symname(const struct symbol *sym, FILE *fp) -{ - return symbol__fprintf_symname_offs(sym, NULL, fp); -} - void symbols__delete(struct rb_root *symbols) { struct symbol *pos; @@ -335,7 +301,7 @@ static struct symbol *symbols__find(struct rb_root *symbols, u64 ip) if (ip < s->start) n = n->rb_left; - else if (ip >= s->end) + else if (ip > s->end || (ip == s->end && ip != s->start)) n = n->rb_right; else return s; @@ -364,11 +330,6 @@ static struct symbol *symbols__next(struct symbol *sym) return NULL; } -struct symbol_name_rb_node { - struct rb_node rb_node; - struct symbol sym; -}; - static void symbols__insert_by_name(struct rb_root *symbols, struct symbol *sym) { struct rb_node **p = &symbols->rb_node; @@ -452,6 +413,18 @@ void dso__reset_find_symbol_cache(struct dso *dso) } } +void dso__insert_symbol(struct dso *dso, enum map_type type, struct symbol *sym) +{ + symbols__insert(&dso->symbols[type], sym); + + /* update the symbol cache if necessary */ + if (dso->last_find_result[type].addr >= sym->start && + (dso->last_find_result[type].addr < sym->end || + sym->start == sym->end)) { + dso->last_find_result[type].symbol = sym; + } +} + struct symbol *dso__find_symbol(struct dso *dso, enum map_type type, u64 addr) { @@ -497,21 +470,6 @@ void dso__sort_by_name(struct dso *dso, enum map_type type) &dso->symbols[type]); } -size_t dso__fprintf_symbols_by_name(struct dso *dso, - enum map_type type, FILE *fp) -{ - size_t ret = 0; - struct rb_node *nd; - struct symbol_name_rb_node *pos; - - for (nd = rb_first(&dso->symbol_names[type]); nd; nd = rb_next(nd)) { - pos = rb_entry(nd, struct symbol_name_rb_node, rb_node); - fprintf(fp, "%s\n", pos->sym.name); - } - - return ret; -} - int modules__parse(const char *filename, void *arg, int (*process_module)(void *arg, const char *name, u64 start)) @@ -1262,8 +1220,8 @@ static int kallsyms__delta(struct map *map, 
const char *filename, u64 *delta) return 0; } -int dso__load_kallsyms(struct dso *dso, const char *filename, - struct map *map, symbol_filter_t filter) +int __dso__load_kallsyms(struct dso *dso, const char *filename, + struct map *map, bool no_kcore, symbol_filter_t filter) { u64 delta = 0; @@ -1284,12 +1242,18 @@ int dso__load_kallsyms(struct dso *dso, const char *filename, else dso->symtab_type = DSO_BINARY_TYPE__KALLSYMS; - if (!dso__load_kcore(dso, map, filename)) + if (!no_kcore && !dso__load_kcore(dso, map, filename)) return dso__split_kallsyms_for_kcore(dso, map, filter); else return dso__split_kallsyms(dso, map, delta, filter); } +int dso__load_kallsyms(struct dso *dso, const char *filename, + struct map *map, symbol_filter_t filter) +{ + return __dso__load_kallsyms(dso, filename, map, false, filter); +} + static int dso__load_perf_map(struct dso *dso, struct map *map, symbol_filter_t filter) { @@ -1644,25 +1608,27 @@ out: return err; } +static bool visible_dir_filter(const char *name, struct dirent *d) +{ + if (d->d_type != DT_DIR) + return false; + return lsdir_no_dot_filter(name, d); +} + static int find_matching_kcore(struct map *map, char *dir, size_t dir_sz) { char kallsyms_filename[PATH_MAX]; - struct dirent *dent; int ret = -1; - DIR *d; + struct strlist *dirs; + struct str_node *nd; - d = opendir(dir); - if (!d) + dirs = lsdir(dir, visible_dir_filter); + if (!dirs) return -1; - while (1) { - dent = readdir(d); - if (!dent) - break; - if (dent->d_type != DT_DIR) - continue; + strlist__for_each(nd, dirs) { scnprintf(kallsyms_filename, sizeof(kallsyms_filename), - "%s/%s/kallsyms", dir, dent->d_name); + "%s/%s/kallsyms", dir, nd->s); if (!validate_kcore_addresses(kallsyms_filename, map)) { strlcpy(dir, kallsyms_filename, dir_sz); ret = 0; @@ -1670,7 +1636,7 @@ static int find_matching_kcore(struct map *map, char *dir, size_t dir_sz) } } - closedir(d); + strlist__delete(dirs); return ret; } @@ -1678,7 +1644,7 @@ static int find_matching_kcore(struct map *map, char *dir, size_t dir_sz) static char *dso__find_kallsyms(struct dso *dso, struct map *map) { u8 host_build_id[BUILD_ID_SIZE]; - char sbuild_id[BUILD_ID_SIZE * 2 + 1]; + char sbuild_id[SBUILD_ID_SIZE]; bool is_host = false; char path[PATH_MAX]; diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index c8b7544d9267..2b5e4ed76fcb 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -55,6 +55,7 @@ struct symbol { u16 namelen; u8 binding; bool ignore; + u8 arch_sym; char name[0]; }; @@ -140,6 +141,11 @@ struct symbol_conf { extern struct symbol_conf symbol_conf; +struct symbol_name_rb_node { + struct rb_node rb_node; + struct symbol sym; +}; + static inline int __symbol__join_symfs(char *bf, size_t size, const char *path) { return path__join(bf, size, symbol_conf.symfs, path); @@ -235,9 +241,14 @@ int dso__load_vmlinux(struct dso *dso, struct map *map, symbol_filter_t filter); int dso__load_vmlinux_path(struct dso *dso, struct map *map, symbol_filter_t filter); +int __dso__load_kallsyms(struct dso *dso, const char *filename, struct map *map, + bool no_kcore, symbol_filter_t filter); int dso__load_kallsyms(struct dso *dso, const char *filename, struct map *map, symbol_filter_t filter); +void dso__insert_symbol(struct dso *dso, enum map_type type, + struct symbol *sym); + struct symbol *dso__find_symbol(struct dso *dso, enum map_type type, u64 addr); struct symbol *dso__find_symbol_by_name(struct dso *dso, enum map_type type, @@ -262,8 +273,14 @@ int symbol__init(struct perf_env *env); void 
symbol__exit(void); void symbol__elf_init(void); struct symbol *symbol__new(u64 start, u64 len, u8 binding, const char *name); +size_t __symbol__fprintf_symname_offs(const struct symbol *sym, + const struct addr_location *al, + bool unknown_as_addr, FILE *fp); size_t symbol__fprintf_symname_offs(const struct symbol *sym, const struct addr_location *al, FILE *fp); +size_t __symbol__fprintf_symname(const struct symbol *sym, + const struct addr_location *al, + bool unknown_as_addr, FILE *fp); size_t symbol__fprintf_symname(const struct symbol *sym, FILE *fp); size_t symbol__fprintf(struct symbol *sym, FILE *fp); bool symbol_type__is_a(char symbol_type, enum map_type map_type); @@ -310,7 +327,7 @@ int setup_intlist(struct intlist **list, const char *list_str, #ifdef HAVE_LIBELF_SUPPORT bool elf__needs_adjust_symbols(GElf_Ehdr ehdr); -void arch__elf_sym_adjust(GElf_Sym *sym); +void arch__sym_update(struct symbol *s, GElf_Sym *sym); #endif #define SYMBOL_A 0 diff --git a/tools/perf/util/symbol_fprintf.c b/tools/perf/util/symbol_fprintf.c new file mode 100644 index 000000000000..a680bdaa65dc --- /dev/null +++ b/tools/perf/util/symbol_fprintf.c @@ -0,0 +1,71 @@ +#include <elf.h> +#include <inttypes.h> +#include <stdio.h> + +#include "symbol.h" + +size_t symbol__fprintf(struct symbol *sym, FILE *fp) +{ + return fprintf(fp, " %" PRIx64 "-%" PRIx64 " %c %s\n", + sym->start, sym->end, + sym->binding == STB_GLOBAL ? 'g' : + sym->binding == STB_LOCAL ? 'l' : 'w', + sym->name); +} + +size_t __symbol__fprintf_symname_offs(const struct symbol *sym, + const struct addr_location *al, + bool unknown_as_addr, FILE *fp) +{ + unsigned long offset; + size_t length; + + if (sym && sym->name) { + length = fprintf(fp, "%s", sym->name); + if (al) { + if (al->addr < sym->end) + offset = al->addr - sym->start; + else + offset = al->addr - al->map->start - sym->start; + length += fprintf(fp, "+0x%lx", offset); + } + return length; + } else if (al && unknown_as_addr) + return fprintf(fp, "[%#" PRIx64 "]", al->addr); + else + return fprintf(fp, "[unknown]"); +} + +size_t symbol__fprintf_symname_offs(const struct symbol *sym, + const struct addr_location *al, + FILE *fp) +{ + return __symbol__fprintf_symname_offs(sym, al, false, fp); +} + +size_t __symbol__fprintf_symname(const struct symbol *sym, + const struct addr_location *al, + bool unknown_as_addr, FILE *fp) +{ + return __symbol__fprintf_symname_offs(sym, al, unknown_as_addr, fp); +} + +size_t symbol__fprintf_symname(const struct symbol *sym, FILE *fp) +{ + return __symbol__fprintf_symname_offs(sym, NULL, false, fp); +} + +size_t dso__fprintf_symbols_by_name(struct dso *dso, + enum map_type type, FILE *fp) +{ + size_t ret = 0; + struct rb_node *nd; + struct symbol_name_rb_node *pos; + + for (nd = rb_first(&dso->symbol_names[type]); nd; nd = rb_next(nd)) { + pos = rb_entry(nd, struct symbol_name_rb_node, rb_node); + fprintf(fp, "%s\n", pos->sym.name); + } + + return ret; +} diff --git a/tools/perf/util/syscalltbl.c b/tools/perf/util/syscalltbl.c new file mode 100644 index 000000000000..bbb4c1957578 --- /dev/null +++ b/tools/perf/util/syscalltbl.c @@ -0,0 +1,134 @@ +/* + * System call table mapper + * + * (C) 2016 Arnaldo Carvalho de Melo <acme@redhat.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include "syscalltbl.h" +#include <stdlib.h> + +#ifdef HAVE_SYSCALL_TABLE +#include <linux/compiler.h> +#include <string.h> +#include "util.h" + +#if defined(__x86_64__) +#include <asm/syscalls_64.c> +const int syscalltbl_native_max_id = SYSCALLTBL_x86_64_MAX_ID; +static const char **syscalltbl_native = syscalltbl_x86_64; +#endif + +struct syscall { + int id; + const char *name; +}; + +static int syscallcmpname(const void *vkey, const void *ventry) +{ + const char *key = vkey; + const struct syscall *entry = ventry; + + return strcmp(key, entry->name); +} + +static int syscallcmp(const void *va, const void *vb) +{ + const struct syscall *a = va, *b = vb; + + return strcmp(a->name, b->name); +} + +static int syscalltbl__init_native(struct syscalltbl *tbl) +{ + int nr_entries = 0, i, j; + struct syscall *entries; + + for (i = 0; i <= syscalltbl_native_max_id; ++i) + if (syscalltbl_native[i]) + ++nr_entries; + + entries = tbl->syscalls.entries = malloc(sizeof(struct syscall) * nr_entries); + if (tbl->syscalls.entries == NULL) + return -1; + + for (i = 0, j = 0; i <= syscalltbl_native_max_id; ++i) { + if (syscalltbl_native[i]) { + entries[j].name = syscalltbl_native[i]; + entries[j].id = i; + ++j; + } + } + + qsort(tbl->syscalls.entries, nr_entries, sizeof(struct syscall), syscallcmp); + tbl->syscalls.nr_entries = nr_entries; + return 0; +} + +struct syscalltbl *syscalltbl__new(void) +{ + struct syscalltbl *tbl = malloc(sizeof(*tbl)); + if (tbl) { + if (syscalltbl__init_native(tbl)) { + free(tbl); + return NULL; + } + } + return tbl; +} + +void syscalltbl__delete(struct syscalltbl *tbl) +{ + zfree(&tbl->syscalls.entries); + free(tbl); +} + +const char *syscalltbl__name(const struct syscalltbl *tbl __maybe_unused, int id) +{ + return id <= syscalltbl_native_max_id ? syscalltbl_native[id]: NULL; +} + +int syscalltbl__id(struct syscalltbl *tbl, const char *name) +{ + struct syscall *sc = bsearch(name, tbl->syscalls.entries, + tbl->syscalls.nr_entries, sizeof(*sc), + syscallcmpname); + + return sc ? 
sc->id : -1; +} + +#else /* HAVE_SYSCALL_TABLE */ + +#include <libaudit.h> + +struct syscalltbl *syscalltbl__new(void) +{ + struct syscalltbl *tbl = malloc(sizeof(*tbl)); + if (tbl) + tbl->audit_machine = audit_detect_machine(); + return tbl; +} + +void syscalltbl__delete(struct syscalltbl *tbl) +{ + free(tbl); +} + +const char *syscalltbl__name(const struct syscalltbl *tbl, int id) +{ + return audit_syscall_to_name(id, tbl->audit_machine); +} + +int syscalltbl__id(struct syscalltbl *tbl, const char *name) +{ + return audit_name_to_syscall(name, tbl->audit_machine); +} +#endif /* HAVE_SYSCALL_TABLE */ diff --git a/tools/perf/util/syscalltbl.h b/tools/perf/util/syscalltbl.h new file mode 100644 index 000000000000..e2951510484f --- /dev/null +++ b/tools/perf/util/syscalltbl.h @@ -0,0 +1,20 @@ +#ifndef __PERF_SYSCALLTBL_H +#define __PERF_SYSCALLTBL_H + +struct syscalltbl { + union { + int audit_machine; + struct { + int nr_entries; + void *entries; + } syscalls; + }; +}; + +struct syscalltbl *syscalltbl__new(void); +void syscalltbl__delete(struct syscalltbl *tbl); + +const char *syscalltbl__name(const struct syscalltbl *tbl, int id); +int syscalltbl__id(struct syscalltbl *tbl, const char *name); + +#endif /* __PERF_SYSCALLTBL_H */ diff --git a/tools/perf/util/thread-stack.c b/tools/perf/util/thread-stack.c index 679688e70ae7..825086aa9a08 100644 --- a/tools/perf/util/thread-stack.c +++ b/tools/perf/util/thread-stack.c @@ -22,44 +22,9 @@ #include "debug.h" #include "symbol.h" #include "comm.h" +#include "call-path.h" #include "thread-stack.h" -#define CALL_PATH_BLOCK_SHIFT 8 -#define CALL_PATH_BLOCK_SIZE (1 << CALL_PATH_BLOCK_SHIFT) -#define CALL_PATH_BLOCK_MASK (CALL_PATH_BLOCK_SIZE - 1) - -struct call_path_block { - struct call_path cp[CALL_PATH_BLOCK_SIZE]; - struct list_head node; -}; - -/** - * struct call_path_root - root of all call paths. - * @call_path: root call path - * @blocks: list of blocks to store call paths - * @next: next free space - * @sz: number of spaces - */ -struct call_path_root { - struct call_path call_path; - struct list_head blocks; - size_t next; - size_t sz; -}; - -/** - * struct call_return_processor - provides a call-back to consume call-return - * information. - * @cpr: call path root - * @process: call-back that accepts call/return information - * @data: anonymous data for call-back - */ -struct call_return_processor { - struct call_path_root *cpr; - int (*process)(struct call_return *cr, void *data); - void *data; -}; - #define STACK_GROWTH 2048 /** @@ -335,108 +300,6 @@ void thread_stack__sample(struct thread *thread, struct ip_callchain *chain, chain->ips[i] = thread->ts->stack[thread->ts->cnt - i].ret_addr; } -static void call_path__init(struct call_path *cp, struct call_path *parent, - struct symbol *sym, u64 ip, bool in_kernel) -{ - cp->parent = parent; - cp->sym = sym; - cp->ip = sym ? 
0 : ip; - cp->db_id = 0; - cp->in_kernel = in_kernel; - RB_CLEAR_NODE(&cp->rb_node); - cp->children = RB_ROOT; -} - -static struct call_path_root *call_path_root__new(void) -{ - struct call_path_root *cpr; - - cpr = zalloc(sizeof(struct call_path_root)); - if (!cpr) - return NULL; - call_path__init(&cpr->call_path, NULL, NULL, 0, false); - INIT_LIST_HEAD(&cpr->blocks); - return cpr; -} - -static void call_path_root__free(struct call_path_root *cpr) -{ - struct call_path_block *pos, *n; - - list_for_each_entry_safe(pos, n, &cpr->blocks, node) { - list_del(&pos->node); - free(pos); - } - free(cpr); -} - -static struct call_path *call_path__new(struct call_path_root *cpr, - struct call_path *parent, - struct symbol *sym, u64 ip, - bool in_kernel) -{ - struct call_path_block *cpb; - struct call_path *cp; - size_t n; - - if (cpr->next < cpr->sz) { - cpb = list_last_entry(&cpr->blocks, struct call_path_block, - node); - } else { - cpb = zalloc(sizeof(struct call_path_block)); - if (!cpb) - return NULL; - list_add_tail(&cpb->node, &cpr->blocks); - cpr->sz += CALL_PATH_BLOCK_SIZE; - } - - n = cpr->next++ & CALL_PATH_BLOCK_MASK; - cp = &cpb->cp[n]; - - call_path__init(cp, parent, sym, ip, in_kernel); - - return cp; -} - -static struct call_path *call_path__findnew(struct call_path_root *cpr, - struct call_path *parent, - struct symbol *sym, u64 ip, u64 ks) -{ - struct rb_node **p; - struct rb_node *node_parent = NULL; - struct call_path *cp; - bool in_kernel = ip >= ks; - - if (sym) - ip = 0; - - if (!parent) - return call_path__new(cpr, parent, sym, ip, in_kernel); - - p = &parent->children.rb_node; - while (*p != NULL) { - node_parent = *p; - cp = rb_entry(node_parent, struct call_path, rb_node); - - if (cp->sym == sym && cp->ip == ip) - return cp; - - if (sym < cp->sym || (sym == cp->sym && ip < cp->ip)) - p = &(*p)->rb_left; - else - p = &(*p)->rb_right; - } - - cp = call_path__new(cpr, parent, sym, ip, in_kernel); - if (!cp) - return NULL; - - rb_link_node(&cp->rb_node, node_parent, p); - rb_insert_color(&cp->rb_node, &parent->children); - - return cp; -} - struct call_return_processor * call_return_processor__new(int (*process)(struct call_return *cr, void *data), void *data) diff --git a/tools/perf/util/thread-stack.h b/tools/perf/util/thread-stack.h index e1528f1374c3..ad44c7944b8e 100644 --- a/tools/perf/util/thread-stack.h +++ b/tools/perf/util/thread-stack.h @@ -19,17 +19,16 @@ #include <sys/types.h> #include <linux/types.h> -#include <linux/rbtree.h> struct thread; struct comm; struct ip_callchain; struct symbol; struct dso; -struct call_return_processor; struct comm; struct perf_sample; struct addr_location; +struct call_path; /* * Call/Return flags. @@ -69,26 +68,16 @@ struct call_return { }; /** - * struct call_path - node in list of calls leading to a function call. - * @parent: call path to the parent function call - * @sym: symbol of function called - * @ip: only if sym is null, the ip of the function - * @db_id: id used for db-export - * @in_kernel: whether function is a in the kernel - * @rb_node: node in parent's tree of called functions - * @children: tree of call paths of functions called - * - * In combination with the call_return structure, the call_path structure - * defines a context-sensitve call-graph. + * struct call_return_processor - provides a call-back to consume call-return + * information. 
+ * @cpr: call path root + * @process: call-back that accepts call/return information + * @data: anonymous data for call-back */ -struct call_path { - struct call_path *parent; - struct symbol *sym; - u64 ip; - u64 db_id; - bool in_kernel; - struct rb_node rb_node; - struct rb_root children; +struct call_return_processor { + struct call_path_root *cpr; + int (*process)(struct call_return *cr, void *data); + void *data; }; int thread_stack__event(struct thread *thread, u32 flags, u64 from_ip, diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index dfd00c6dad6e..45fcb715a36b 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -10,6 +10,8 @@ #include "comm.h" #include "unwind.h" +#include <api/fs/fs.h> + int thread__init_map_groups(struct thread *thread, struct machine *machine) { struct thread *leader; @@ -153,6 +155,23 @@ int __thread__set_comm(struct thread *thread, const char *str, u64 timestamp, return 0; } +int thread__set_comm_from_proc(struct thread *thread) +{ + char path[64]; + char *comm = NULL; + size_t sz; + int err = -1; + + if (!(snprintf(path, sizeof(path), "%d/task/%d/comm", + thread->pid_, thread->tid) >= (int)sizeof(path)) && + procfs__read_str(path, &comm, &sz) == 0) { + comm[sz - 1] = '\0'; + err = thread__set_comm(thread, comm, 0); + } + + return err; +} + const char *thread__comm_str(const struct thread *thread) { const struct comm *comm = thread__comm(thread); @@ -233,7 +252,7 @@ void thread__find_cpumode_addr_location(struct thread *thread, struct addr_location *al) { size_t i; - const u8 const cpumodes[] = { + const u8 cpumodes[] = { PERF_RECORD_MISC_USER, PERF_RECORD_MISC_KERNEL, PERF_RECORD_MISC_GUEST_USER, diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h index a0ac0317affb..45fba13c800b 100644 --- a/tools/perf/util/thread.h +++ b/tools/perf/util/thread.h @@ -9,6 +9,9 @@ #include "symbol.h" #include <strlist.h> #include <intlist.h> +#ifdef HAVE_LIBUNWIND_SUPPORT +#include <libunwind.h> +#endif struct thread_stack; @@ -32,6 +35,9 @@ struct thread { void *priv; struct thread_stack *ts; +#ifdef HAVE_LIBUNWIND_SUPPORT + unw_addr_space_t addr_space; +#endif }; struct machine; @@ -65,6 +71,8 @@ static inline int thread__set_comm(struct thread *thread, const char *comm, return __thread__set_comm(thread, comm, timestamp, false); } +int thread__set_comm_from_proc(struct thread *thread); + int thread__comm_len(struct thread *thread); struct comm *thread__comm(const struct thread *thread); struct comm *thread__exec_comm(const struct thread *thread); diff --git a/tools/perf/util/thread_map.c b/tools/perf/util/thread_map.c index 08afc6909953..5654fe15e036 100644 --- a/tools/perf/util/thread_map.c +++ b/tools/perf/util/thread_map.c @@ -94,7 +94,7 @@ struct thread_map *thread_map__new_by_uid(uid_t uid) DIR *proc; int max_threads = 32, items, i; char path[256]; - struct dirent dirent, *next, **namelist = NULL; + struct dirent *dirent, **namelist = NULL; struct thread_map *threads = thread_map__alloc(max_threads); if (threads == NULL) @@ -107,16 +107,16 @@ struct thread_map *thread_map__new_by_uid(uid_t uid) threads->nr = 0; atomic_set(&threads->refcnt, 1); - while (!readdir_r(proc, &dirent, &next) && next) { + while ((dirent = readdir(proc)) != NULL) { char *end; bool grow = false; struct stat st; - pid_t pid = strtol(dirent.d_name, &end, 10); + pid_t pid = strtol(dirent->d_name, &end, 10); if (*end) /* only interested in proper numerical dirents */ continue; - snprintf(path, sizeof(path), "/proc/%s", dirent.d_name); + 
snprintf(path, sizeof(path), "/proc/%s", dirent->d_name); if (stat(path, &st) != 0) continue; @@ -260,7 +260,7 @@ struct thread_map *thread_map__new_dummy(void) return threads; } -static struct thread_map *thread_map__new_by_tid_str(const char *tid_str) +struct thread_map *thread_map__new_by_tid_str(const char *tid_str) { struct thread_map *threads = NULL, *nt; int ntasks = 0; @@ -436,3 +436,15 @@ struct thread_map *thread_map__new_event(struct thread_map_event *event) return threads; } + +bool thread_map__has(struct thread_map *threads, pid_t pid) +{ + int i; + + for (i = 0; i < threads->nr; ++i) { + if (threads->map[i].pid == pid) + return true; + } + + return false; +} diff --git a/tools/perf/util/thread_map.h b/tools/perf/util/thread_map.h index 85e4c7c4fbde..bd3b971588da 100644 --- a/tools/perf/util/thread_map.h +++ b/tools/perf/util/thread_map.h @@ -31,6 +31,8 @@ void thread_map__put(struct thread_map *map); struct thread_map *thread_map__new_str(const char *pid, const char *tid, uid_t uid); +struct thread_map *thread_map__new_by_tid_str(const char *tid_str); + size_t thread_map__fprintf(struct thread_map *threads, FILE *fp); static inline int thread_map__nr(struct thread_map *threads) @@ -55,4 +57,5 @@ static inline char *thread_map__comm(struct thread_map *map, int thread) } void thread_map__read_comms(struct thread_map *threads); +bool thread_map__has(struct thread_map *threads, pid_t pid); #endif /* __PERF_THREAD_MAP_H */ diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h index 55de4cffcd4e..ac2590a3de2d 100644 --- a/tools/perf/util/tool.h +++ b/tools/perf/util/tool.h @@ -57,6 +57,7 @@ struct perf_tool { id_index, auxtrace_info, auxtrace_error, + time_conv, thread_map, cpu_map, stat_config, diff --git a/tools/perf/util/trigger.h b/tools/perf/util/trigger.h new file mode 100644 index 000000000000..e97d7016d771 --- /dev/null +++ b/tools/perf/util/trigger.h @@ -0,0 +1,94 @@ +#ifndef __TRIGGER_H_ +#define __TRIGGER_H_ 1 + +#include "util/debug.h" +#include "asm/bug.h" + +/* + * Use trigger to model operations which need to be executed when + * an event (a signal, for example) is observed. + * + * States and transits: + * + * + * OFF--(on)--> READY --(hit)--> HIT + * ^ | + * | (ready) + * | | + * \_____________/ + * + * is_hit and is_ready are two key functions to query the state of + * a trigger. is_hit means the event already happen; is_ready means the + * trigger is waiting for the event. 
+ */ + +struct trigger { + volatile enum { + TRIGGER_ERROR = -2, + TRIGGER_OFF = -1, + TRIGGER_READY = 0, + TRIGGER_HIT = 1, + } state; + const char *name; +}; + +#define TRIGGER_WARN_ONCE(t, exp) \ + WARN_ONCE(t->state != exp, "trigger '%s' state transist error: %d in %s()\n", \ + t->name, t->state, __func__) + +static inline bool trigger_is_available(struct trigger *t) +{ + return t->state >= 0; +} + +static inline bool trigger_is_error(struct trigger *t) +{ + return t->state <= TRIGGER_ERROR; +} + +static inline void trigger_on(struct trigger *t) +{ + TRIGGER_WARN_ONCE(t, TRIGGER_OFF); + t->state = TRIGGER_READY; +} + +static inline void trigger_ready(struct trigger *t) +{ + if (!trigger_is_available(t)) + return; + t->state = TRIGGER_READY; +} + +static inline void trigger_hit(struct trigger *t) +{ + if (!trigger_is_available(t)) + return; + TRIGGER_WARN_ONCE(t, TRIGGER_READY); + t->state = TRIGGER_HIT; +} + +static inline void trigger_off(struct trigger *t) +{ + if (!trigger_is_available(t)) + return; + t->state = TRIGGER_OFF; +} + +static inline void trigger_error(struct trigger *t) +{ + t->state = TRIGGER_ERROR; +} + +static inline bool trigger_is_ready(struct trigger *t) +{ + return t->state == TRIGGER_READY; +} + +static inline bool trigger_is_hit(struct trigger *t) +{ + return t->state == TRIGGER_HIT; +} + +#define DEFINE_TRIGGER(n) \ +struct trigger n = {.state = TRIGGER_OFF, .name = #n} +#endif diff --git a/tools/perf/util/tsc.h b/tools/perf/util/tsc.h index a8b78f1b3243..d5b11e2b85e0 100644 --- a/tools/perf/util/tsc.h +++ b/tools/perf/util/tsc.h @@ -3,10 +3,29 @@ #include <linux/types.h> -#include "../arch/x86/util/tsc.h" +#include "event.h" + +struct perf_tsc_conversion { + u16 time_shift; + u32 time_mult; + u64 time_zero; +}; +struct perf_event_mmap_page; + +int perf_read_tsc_conversion(const struct perf_event_mmap_page *pc, + struct perf_tsc_conversion *tc); u64 perf_time_to_tsc(u64 ns, struct perf_tsc_conversion *tc); u64 tsc_to_perf_time(u64 cyc, struct perf_tsc_conversion *tc); u64 rdtsc(void); +struct perf_event_mmap_page; +struct perf_tool; +struct machine; + +int perf_event__synth_time_conv(const struct perf_event_mmap_page *pc, + struct perf_tool *tool, + perf_event__handler_t process, + struct machine *machine); + #endif diff --git a/tools/perf/util/unwind-libunwind.c b/tools/perf/util/unwind-libunwind.c index ee7e372297e5..63687d3a344e 100644 --- a/tools/perf/util/unwind-libunwind.c +++ b/tools/perf/util/unwind-libunwind.c @@ -32,6 +32,7 @@ #include "symbol.h" #include "util.h" #include "debug.h" +#include "asm/bug.h" extern int UNW_OBJ(dwarf_search_unwind_table) (unw_addr_space_t as, @@ -580,43 +581,33 @@ static unw_accessors_t accessors = { int unwind__prepare_access(struct thread *thread) { - unw_addr_space_t addr_space; - if (callchain_param.record_mode != CALLCHAIN_DWARF) return 0; - addr_space = unw_create_addr_space(&accessors, 0); - if (!addr_space) { + thread->addr_space = unw_create_addr_space(&accessors, 0); + if (!thread->addr_space) { pr_err("unwind: Can't create unwind address space.\n"); return -ENOMEM; } - unw_set_caching_policy(addr_space, UNW_CACHE_GLOBAL); - thread__set_priv(thread, addr_space); - + unw_set_caching_policy(thread->addr_space, UNW_CACHE_GLOBAL); return 0; } void unwind__flush_access(struct thread *thread) { - unw_addr_space_t addr_space; - if (callchain_param.record_mode != CALLCHAIN_DWARF) return; - addr_space = thread__priv(thread); - unw_flush_cache(addr_space, 0, 0); + unw_flush_cache(thread->addr_space, 0, 0); } void 
unwind__finish_access(struct thread *thread) { - unw_addr_space_t addr_space; - if (callchain_param.record_mode != CALLCHAIN_DWARF) return; - addr_space = thread__priv(thread); - unw_destroy_addr_space(addr_space); + unw_destroy_addr_space(thread->addr_space); } static int get_entries(struct unwind_info *ui, unwind_entry_cb_t cb, @@ -639,7 +630,9 @@ static int get_entries(struct unwind_info *ui, unwind_entry_cb_t cb, * unwind itself. */ if (max_stack - 1 > 0) { - addr_space = thread__priv(ui->thread); + WARN_ONCE(!ui->thread, "WARNING: ui->thread is NULL"); + addr_space = ui->thread->addr_space; + if (addr_space == NULL) return -1; diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index b7766c577b01..eab077ad6ca9 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -33,6 +33,8 @@ struct callchain_param callchain_param = { unsigned int page_size; int cacheline_size; +unsigned int sysctl_perf_event_max_stack = PERF_MAX_STACK_DEPTH; + bool test_attr__enabled; bool perf_host = true; @@ -117,6 +119,40 @@ int rm_rf(char *path) return rmdir(path); } +/* A filter which removes dot files */ +bool lsdir_no_dot_filter(const char *name __maybe_unused, struct dirent *d) +{ + return d->d_name[0] != '.'; +} + +/* lsdir reads a directory and store it in strlist */ +struct strlist *lsdir(const char *name, + bool (*filter)(const char *, struct dirent *)) +{ + struct strlist *list = NULL; + DIR *dir; + struct dirent *d; + + dir = opendir(name); + if (!dir) + return NULL; + + list = strlist__new(NULL, NULL); + if (!list) { + errno = ENOMEM; + goto out; + } + + while ((d = readdir(dir)) != NULL) { + if (!filter || filter(name, d)) + strlist__add(list, d->d_name); + } + +out: + closedir(dir); + return list; +} + static int slow_copyfile(const char *from, const char *to) { int err = -1; @@ -471,7 +507,6 @@ int parse_callchain_record(const char *arg, struct callchain_param *param) "needed for --call-graph fp\n"); break; -#ifdef HAVE_DWARF_UNWIND_SUPPORT /* Dwarf style */ } else if (!strncmp(name, "dwarf", sizeof("dwarf"))) { const unsigned long default_stack_dump_size = 8192; @@ -487,7 +522,6 @@ int parse_callchain_record(const char *arg, struct callchain_param *param) ret = get_stack_size(tok, &size); param->dump_size = size; } -#endif /* HAVE_DWARF_UNWIND_SUPPORT */ } else if (!strncmp(name, "lbr", sizeof("lbr"))) { if (!strtok_r(NULL, ",", &saveptr)) { param->record_mode = CALLCHAIN_LBR; diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index 8298d607c738..7651633a8dc7 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -79,6 +79,7 @@ #include <termios.h> #include <linux/bitops.h> #include <termios.h> +#include "strlist.h" extern const char *graph_line; extern const char *graph_dotted_line; @@ -159,12 +160,6 @@ static inline char *gitstrchrnul(const char *s, int c) } #endif -/* - * Wrappers: - */ -void *xrealloc(void *ptr, size_t size) __attribute__((weak)); - - static inline void *zalloc(size_t size) { return calloc(1, size); @@ -222,6 +217,8 @@ static inline int sane_case(int x, int high) int mkdir_p(char *path, mode_t mode); int rm_rf(char *path); +struct strlist *lsdir(const char *name, bool (*filter)(const char *, struct dirent *)); +bool lsdir_no_dot_filter(const char *name, struct dirent *d); int copyfile(const char *from, const char *to); int copyfile_mode(const char *from, const char *to, mode_t mode); int copyfile_offset(int fromfd, loff_t from_ofs, int tofd, loff_t to_ofs, u64 size); @@ -254,11 +251,17 @@ int hex2u64(const char *ptr, u64 *val); 
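
The new lsdir() helper and lsdir_no_dot_filter() shown above replace open-coded opendir()/readdir() loops (and the deprecated readdir_r() usage elsewhere in this series) with a filtered directory listing collected into a strlist. As a minimal, self-contained illustration of that filter-callback pattern — not part of the patch itself, and without the perf-internal strlist type; the list_dir() and no_dot_filter() names here are made up for the example — the following sketch walks a directory and applies a caller-supplied filter:

#include <dirent.h>
#include <stdbool.h>
#include <stdio.h>

/* Same idea as lsdir_no_dot_filter(): skip ".", ".." and other dot entries. */
static bool no_dot_filter(const char *dirname, struct dirent *d)
{
	(void)dirname;		/* unused, kept only to match the filter signature */
	return d->d_name[0] != '.';
}

/* Walk 'name' and print entries accepted by 'filter' (a NULL filter accepts all). */
static int list_dir(const char *name,
		    bool (*filter)(const char *, struct dirent *))
{
	DIR *dir = opendir(name);
	struct dirent *d;

	if (!dir)
		return -1;

	while ((d = readdir(dir)) != NULL) {
		if (!filter || filter(name, d))
			printf("%s\n", d->d_name);
	}

	closedir(dir);
	return 0;
}

int main(void)
{
	return list_dir(".", no_dot_filter) ? 1 : 0;
}

In the patch, find_matching_kcore() follows the same shape: lsdir(dir, visible_dir_filter) builds the filtered list, strlist__for_each() iterates it, and strlist__delete() releases it, which removes the per-call-site readdir loop and error handling.
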
char *ltrim(char *s); char *rtrim(char *s); +static inline char *trim(char *s) +{ + return ltrim(rtrim(s)); +} + void dump_stack(void); void sighandler_dump_stack(int sig); extern unsigned int page_size; extern int cacheline_size; +extern unsigned int sysctl_perf_event_max_stack; struct parse_tag { char tag; diff --git a/tools/perf/util/wrapper.c b/tools/perf/util/wrapper.c deleted file mode 100644 index 5f1a07c4b87b..000000000000 --- a/tools/perf/util/wrapper.c +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Various trivial helper wrappers around standard functions - */ -#include "cache.h" - -/* - * There's no pack memory to release - but stay close to the Git - * version so wrap this away: - */ -static inline void release_pack_memory(size_t size __maybe_unused, - int flag __maybe_unused) -{ -} - -void *xrealloc(void *ptr, size_t size) -{ - void *ret = realloc(ptr, size); - if (!ret && !size) - ret = realloc(ptr, 1); - if (!ret) { - release_pack_memory(size, -1); - ret = realloc(ptr, size); - if (!ret && !size) - ret = realloc(ptr, 1); - if (!ret) - die("Out of memory, realloc failed"); - } - return ret; -} diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index b04afc3295df..ff9e5f20a5a7 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -19,6 +19,7 @@ TARGETS += powerpc TARGETS += pstore TARGETS += ptrace TARGETS += seccomp +TARGETS += sigaltstack TARGETS += size TARGETS += static_keys TARGETS += sysctl diff --git a/tools/testing/selftests/rcutorture/bin/jitter.sh b/tools/testing/selftests/rcutorture/bin/jitter.sh new file mode 100755 index 000000000000..3633828375e3 --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/jitter.sh @@ -0,0 +1,90 @@ +#!/bin/bash +# +# Alternate sleeping and spinning on randomly selected CPUs. The purpose +# of this script is to inflict random OS jitter on a concurrently running +# test. +# +# Usage: jitter.sh me duration [ sleepmax [ spinmax ] ] +# +# me: Random-number-generator seed salt. +# duration: Time to run in seconds. +# sleepmax: Maximum microseconds to sleep, defaults to one second. +# spinmax: Maximum microseconds to spin, defaults to one millisecond. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, you can access it online at +# http://www.gnu.org/licenses/gpl-2.0.html. +# +# Copyright (C) IBM Corporation, 2016 +# +# Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + +me=$(($1 * 1000)) +duration=$2 +sleepmax=${3-1000000} +spinmax=${4-1000} + +n=1 + +starttime=`awk 'BEGIN { print systime(); }' < /dev/null` + +while : +do + # Check for done. 
+ t=`awk -v s=$starttime 'BEGIN { print systime() - s; }' < /dev/null` + if test "$t" -gt "$duration" + then + exit 0; + fi + + # Set affinity to randomly selected CPU + cpus=`ls /sys/devices/system/cpu/*/online | + sed -e 's,/[^/]*$,,' -e 's/^[^0-9]*//' | + grep -v '^0*$'` + cpumask=`awk -v cpus="$cpus" -v me=$me -v n=$n 'BEGIN { + srand(n + me + systime()); + ncpus = split(cpus, ca); + curcpu = ca[int(rand() * ncpus + 1)]; + mask = lshift(1, curcpu); + if (mask + 0 <= 0) + mask = 1; + printf("%#x\n", mask); + }' < /dev/null` + n=$(($n+1)) + if ! taskset -p $cpumask $$ > /dev/null 2>&1 + then + echo taskset failure: '"taskset -p ' $cpumask $$ '"' + exit 1 + fi + + # Sleep a random duration + sleeptime=`awk -v me=$me -v n=$n -v sleepmax=$sleepmax 'BEGIN { + srand(n + me + systime()); + printf("%06d", int(rand() * sleepmax)); + }' < /dev/null` + n=$(($n+1)) + sleep .$sleeptime + + # Spin a random duration + limit=`awk -v me=$me -v n=$n -v spinmax=$spinmax 'BEGIN { + srand(n + me + systime()); + printf("%06d", int(rand() * spinmax)); + }' < /dev/null` + n=$(($n+1)) + for i in {1..$limit} + do + echo > /dev/null + done +done + +exit 1 diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf-ftrace.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf-ftrace.sh new file mode 100755 index 000000000000..f79b0e9e84fc --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf-ftrace.sh @@ -0,0 +1,121 @@ +#!/bin/bash +# +# Analyze a given results directory for rcuperf performance measurements, +# looking for ftrace data. Exits with 0 if data was found, analyzed, and +# printed. Intended to be invoked from kvm-recheck-rcuperf.sh after +# argument checking. +# +# Usage: kvm-recheck-rcuperf-ftrace.sh resdir +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, you can access it online at +# http://www.gnu.org/licenses/gpl-2.0.html. +# +# Copyright (C) IBM Corporation, 2016 +# +# Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + +i="$1" +. tools/testing/selftests/rcutorture/bin/functions.sh + +if test "`grep -c 'rcu_exp_grace_period.*start' < $i/console.log`" -lt 100 +then + exit 10 +fi + +sed -e 's/^\[[^]]*]//' < $i/console.log | +grep 'us : rcu_exp_grace_period' | +sed -e 's/us : / : /' | +tr -d '\015' | +awk ' +$8 == "start" { + if (starttask != "") + nlost++; + starttask = $1; + starttime = $3; + startseq = $7; +} + +$8 == "end" { + if (starttask == $1 && startseq == $7) { + curgpdur = $3 - starttime; + gptimes[++n] = curgpdur; + gptaskcnt[starttask]++; + sum += curgpdur; + if (curgpdur > 1000) + print "Long GP " starttime "us to " $3 "us (" curgpdur "us)"; + starttask = ""; + } else { + # Lost a message or some such, reset. + starttask = ""; + nlost++; + } +} + +$8 == "done" { + piggybackcnt[$1]++; +} + +END { + newNR = asort(gptimes); + if (newNR <= 0) { + print "No ftrace records found???" 
+ exit 10; + } + pct50 = int(newNR * 50 / 100); + if (pct50 < 1) + pct50 = 1; + pct90 = int(newNR * 90 / 100); + if (pct90 < 1) + pct90 = 1; + pct99 = int(newNR * 99 / 100); + if (pct99 < 1) + pct99 = 1; + div = 10 ** int(log(gptimes[pct90]) / log(10) + .5) / 100; + print "Histogram bucket size: " div; + last = gptimes[1] - 10; + count = 0; + for (i = 1; i <= newNR; i++) { + current = div * int(gptimes[i] / div); + if (last == current) { + count++; + } else { + if (count > 0) + print last, count; + count = 1; + last = current; + } + } + if (count > 0) + print last, count; + print "Distribution of grace periods across tasks:"; + for (i in gptaskcnt) { + print "\t" i, gptaskcnt[i]; + nbatches += gptaskcnt[i]; + } + ngps = nbatches; + print "Distribution of piggybacking across tasks:"; + for (i in piggybackcnt) { + print "\t" i, piggybackcnt[i]; + ngps += piggybackcnt[i]; + } + print "Average grace-period duration: " sum / newNR " microseconds"; + print "Minimum grace-period duration: " gptimes[1]; + print "50th percentile grace-period duration: " gptimes[pct50]; + print "90th percentile grace-period duration: " gptimes[pct90]; + print "99th percentile grace-period duration: " gptimes[pct99]; + print "Maximum grace-period duration: " gptimes[newNR]; + print "Grace periods: " ngps + 0 " Batches: " nbatches + 0 " Ratio: " ngps / nbatches " Lost: " nlost + 0; + print "Computed from ftrace data."; +}' +exit 0 diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh new file mode 100755 index 000000000000..8f3121afc716 --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# +# Analyze a given results directory for rcuperf performance measurements. +# +# Usage: kvm-recheck-rcuperf.sh resdir +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, you can access it online at +# http://www.gnu.org/licenses/gpl-2.0.html. +# +# Copyright (C) IBM Corporation, 2016 +# +# Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + +i="$1" +if test -d $i +then + : +else + echo Unreadable results directory: $i + exit 1 +fi +PATH=`pwd`/tools/testing/selftests/rcutorture/bin:$PATH; export PATH +. tools/testing/selftests/rcutorture/bin/functions.sh + +if kvm-recheck-rcuperf-ftrace.sh $i +then + # ftrace data was successfully analyzed, call it good! + exit 0 +fi + +configfile=`echo $i | sed -e 's/^.*\///'` + +sed -e 's/^\[[^]]*]//' < $i/console.log | +awk ' +/-perf: .* gps: .* batches:/ { + ngps = $9; + nbatches = $11; +} + +/-perf: .*writer-duration/ { + gptimes[++n] = $5 / 1000.; + sum += $5 / 1000.; +} + +END { + newNR = asort(gptimes); + if (newNR <= 0) { + print "No rcuperf records found???" 
+ exit; + } + pct50 = int(newNR * 50 / 100); + if (pct50 < 1) + pct50 = 1; + pct90 = int(newNR * 90 / 100); + if (pct90 < 1) + pct90 = 1; + pct99 = int(newNR * 99 / 100); + if (pct99 < 1) + pct99 = 1; + div = 10 ** int(log(gptimes[pct90]) / log(10) + .5) / 100; + print "Histogram bucket size: " div; + last = gptimes[1] - 10; + count = 0; + for (i = 1; i <= newNR; i++) { + current = div * int(gptimes[i] / div); + if (last == current) { + count++; + } else { + if (count > 0) + print last, count; + count = 1; + last = current; + } + } + if (count > 0) + print last, count; + print "Average grace-period duration: " sum / newNR " microseconds"; + print "Minimum grace-period duration: " gptimes[1]; + print "50th percentile grace-period duration: " gptimes[pct50]; + print "90th percentile grace-period duration: " gptimes[pct90]; + print "99th percentile grace-period duration: " gptimes[pct99]; + print "Maximum grace-period duration: " gptimes[newNR]; + print "Grace periods: " ngps + 0 " Batches: " nbatches + 0 " Ratio: " ngps / nbatches; + print "Computed from rcuperf printk output."; +}' diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh index d86bdd6b6cc2..f659346d3358 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh @@ -48,7 +48,10 @@ do cat $i/Make.oldconfig.err fi parse-build.sh $i/Make.out $configfile - parse-torture.sh $i/console.log $configfile + if test "$TORTURE_SUITE" != rcuperf + then + parse-torture.sh $i/console.log $configfile + fi parse-console.sh $i/console.log $configfile if test -r $i/Warnings then diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 0f80eefb0bfd..4109f306d855 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -6,7 +6,7 @@ # Execute this in the source tree. Do not run it as a background task # because qemu does not seem to like that much. # -# Usage: kvm-test-1-run.sh config builddir resdir minutes qemu-args boot_args +# Usage: kvm-test-1-run.sh config builddir resdir seconds qemu-args boot_args # # qemu-args defaults to "-enable-kvm -soundhw pcspk -nographic", along with # arguments specifying the number of CPUs and other @@ -91,25 +91,33 @@ fi # CONFIG_PCMCIA=n # CONFIG_CARDBUS=n # CONFIG_YENTA=n -if kvm-build.sh $config_template $builddir $T +base_resdir=`echo $resdir | sed -e 's/\.[0-9]\+$//'` +if test "$base_resdir" != "$resdir" -a -f $base_resdir/bzImage -a -f $base_resdir/vmlinux then + # Rerunning previous test, so use that test's kernel. + QEMU="`identify_qemu $base_resdir/vmlinux`" + KERNEL=$base_resdir/bzImage + ln -s $base_resdir/Make*.out $resdir # for kvm-recheck.sh + ln -s $base_resdir/.config $resdir # for kvm-recheck.sh +elif kvm-build.sh $config_template $builddir $T +then + # Had to build a kernel for this test. QEMU="`identify_qemu $builddir/vmlinux`" BOOT_IMAGE="`identify_boot_image $QEMU`" cp $builddir/Make*.out $resdir + cp $builddir/vmlinux $resdir cp $builddir/.config $resdir if test -n "$BOOT_IMAGE" then cp $builddir/$BOOT_IMAGE $resdir + KERNEL=$resdir/bzImage else echo No identifiable boot image, not running KVM, see $resdir. echo Do the torture scripts know about your architecture? fi parse-build.sh $resdir/Make.out $title - if test -f $builddir.wait - then - mv $builddir.wait $builddir.ready - fi else + # Build failed. 
cp $builddir/Make*.out $resdir cp $builddir/.config $resdir || : echo Build failed, not running KVM, see $resdir. @@ -119,12 +127,15 @@ else fi exit 1 fi +if test -f $builddir.wait +then + mv $builddir.wait $builddir.ready +fi while test -f $builddir.ready do sleep 1 done -minutes=$4 -seconds=$(($minutes * 60)) +seconds=$4 qemu_args=$5 boot_args=$6 @@ -167,15 +178,26 @@ then exit 0 fi echo "NOTE: $QEMU either did not run or was interactive" > $resdir/console.log -echo $QEMU $qemu_args -m 512 -kernel $resdir/bzImage -append \"$qemu_append $boot_args\" > $resdir/qemu-cmd -( $QEMU $qemu_args -m 512 -kernel $resdir/bzImage -append "$qemu_append $boot_args"; echo $? > $resdir/qemu-retval ) & -qemu_pid=$! +echo $QEMU $qemu_args -m 512 -kernel $KERNEL -append \"$qemu_append $boot_args\" > $resdir/qemu-cmd +( $QEMU $qemu_args -m 512 -kernel $KERNEL -append "$qemu_append $boot_args"& echo $! > $resdir/qemu_pid; wait `cat $resdir/qemu_pid`; echo $? > $resdir/qemu-retval ) & commandcompleted=0 -echo Monitoring qemu job at pid $qemu_pid +sleep 10 # Give qemu's pid a chance to reach the file +if test -s "$resdir/qemu_pid" +then + qemu_pid=`cat "$resdir/qemu_pid"` + echo Monitoring qemu job at pid $qemu_pid +else + qemu_pid="" + echo Monitoring qemu job at yet-as-unknown pid +fi while : do + if test -z "$qemu_pid" -a -s "$resdir/qemu_pid" + then + qemu_pid=`cat "$resdir/qemu_pid"` + fi kruntime=`awk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null` - if kill -0 $qemu_pid > /dev/null 2>&1 + if test -z "$qemu_pid" || kill -0 "$qemu_pid" > /dev/null 2>&1 then if test $kruntime -ge $seconds then @@ -195,12 +217,16 @@ do ps -fp $killpid >> $resdir/Warnings 2>&1 fi else - echo ' ---' `date`: Kernel done + echo ' ---' `date`: "Kernel done" fi break fi done -if test $commandcompleted -eq 0 +if test -z "$qemu_pid" -a -s "$resdir/qemu_pid" +then + qemu_pid=`cat "$resdir/qemu_pid"` +fi +if test $commandcompleted -eq 0 -a -n "$qemu_pid" then echo Grace period for qemu job at pid $qemu_pid while : @@ -220,6 +246,9 @@ then fi sleep 1 done +elif test -z "$qemu_pid" +then + echo Unknown PID, cannot kill qemu command fi parse-torture.sh $resdir/console.log $title diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 4a431767f77a..0d598145873e 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -34,7 +34,7 @@ T=/tmp/kvm.sh.$$ trap 'rm -rf $T' 0 mkdir $T -dur=30 +dur=$((30*60)) dryrun="" KVM="`pwd`/tools/testing/selftests/rcutorture"; export KVM PATH=${KVM}/bin:$PATH; export PATH @@ -48,6 +48,7 @@ resdir="" configs="" cpus=0 ds=`date +%Y.%m.%d-%H:%M:%S` +jitter=0 . 
functions.sh @@ -63,6 +64,7 @@ usage () { echo " --dryrun sched|script" echo " --duration minutes" echo " --interactive" + echo " --jitter N [ maxsleep (us) [ maxspin (us) ] ]" echo " --kmake-arg kernel-make-arguments" echo " --mac nn:nn:nn:nn:nn:nn" echo " --no-initrd" @@ -116,12 +118,17 @@ do ;; --duration) checkarg --duration "(minutes)" $# "$2" '^[0-9]*$' '^error' - dur=$2 + dur=$(($2*60)) shift ;; --interactive) TORTURE_QEMU_INTERACTIVE=1; export TORTURE_QEMU_INTERACTIVE ;; + --jitter) + checkarg --jitter "(# threads [ sleep [ spin ] ])" $# "$2" '^-\{,1\}[0-9]\+\( \+[0-9]\+\)\{,2\} *$' '^error$' + jitter="$2" + shift + ;; --kmake-arg) checkarg --kmake-arg "(kernel make arguments)" $# "$2" '.*' '^error$' TORTURE_KMAKE_ARG="$2" @@ -156,7 +163,7 @@ do shift ;; --torture) - checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\)$' '^--' + checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\|rcuperf\)$' '^--' TORTURE_SUITE=$2 shift ;; @@ -299,6 +306,7 @@ awk < $T/cfgcpu.pack \ -v CONFIGDIR="$CONFIGFRAG/" \ -v KVM="$KVM" \ -v ncpus=$cpus \ + -v jitter="$jitter" \ -v rd=$resdir/$ds/ \ -v dur=$dur \ -v TORTURE_QEMU_ARG="$TORTURE_QEMU_ARG" \ @@ -359,6 +367,16 @@ function dump(first, pastlast, batchnum) print "\techo ----", cfr[j], cpusr[j] ovf ": Starting kernel. `date` >> " rd "/log"; print "fi" } + njitter = 0; + split(jitter, ja); + if (ja[1] == -1 && ncpus == 0) + njitter = 1; + else if (ja[1] == -1) + njitter = ncpus; + else + njitter = ja[1]; + for (j = 0; j < njitter; j++) + print "jitter.sh " j " " dur " " ja[2] " " ja[3] "&" print "wait" print "if test -z \"$TORTURE_BUILDONLY\"" print "then" diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 b/tools/testing/selftests/rcutorture/configs/rcu/TREE04 index 39a2c6d7d7ec..17cbe098b115 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04 @@ -14,7 +14,7 @@ CONFIG_HOTPLUG_CPU=n CONFIG_SUSPEND=n CONFIG_HIBERNATION=n CONFIG_RCU_FANOUT=4 -CONFIG_RCU_FANOUT_LEAF=4 +CONFIG_RCU_FANOUT_LEAF=3 CONFIG_RCU_NOCB_CPU=n CONFIG_DEBUG_LOCK_ALLOC=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot index 0fc8a3428938..e34c33430447 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot @@ -1 +1 @@ -rcutorture.torture_type=rcu_bh +rcutorture.torture_type=rcu_bh rcutree.rcu_fanout_leaf=4 diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/CFLIST b/tools/testing/selftests/rcutorture/configs/rcuperf/CFLIST new file mode 100644 index 000000000000..c9f56cf20775 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcuperf/CFLIST @@ -0,0 +1 @@ +TREE diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/CFcommon b/tools/testing/selftests/rcutorture/configs/rcuperf/CFcommon new file mode 100644 index 000000000000..a09816b8c0f3 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcuperf/CFcommon @@ -0,0 +1,2 @@ +CONFIG_RCU_PERF_TEST=y +CONFIG_PRINTK_TIME=y diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/TREE b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE new file mode 100644 index 000000000000..a312f671a29a --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE @@ -0,0 +1,20 @@ +CONFIG_SMP=y +CONFIG_PREEMPT_NONE=n +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=y +#CHECK#CONFIG_PREEMPT_RCU=y 
+CONFIG_HZ_PERIODIC=n +CONFIG_NO_HZ_IDLE=y +CONFIG_NO_HZ_FULL=n +CONFIG_RCU_FAST_NO_HZ=n +CONFIG_RCU_TRACE=n +CONFIG_HOTPLUG_CPU=n +CONFIG_SUSPEND=n +CONFIG_HIBERNATION=n +CONFIG_RCU_NOCB_CPU=n +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_PROVE_LOCKING=n +CONFIG_RCU_BOOST=n +CONFIG_DEBUG_OBJECTS_RCU_HEAD=n +CONFIG_RCU_EXPERT=y +CONFIG_RCU_TRACE=y diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/TREE54 b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE54 new file mode 100644 index 000000000000..985fb170d13c --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE54 @@ -0,0 +1,23 @@ +CONFIG_SMP=y +CONFIG_NR_CPUS=54 +CONFIG_PREEMPT_NONE=n +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=y +#CHECK#CONFIG_PREEMPT_RCU=y +CONFIG_HZ_PERIODIC=n +CONFIG_NO_HZ_IDLE=y +CONFIG_NO_HZ_FULL=n +CONFIG_RCU_FAST_NO_HZ=n +CONFIG_RCU_TRACE=n +CONFIG_HOTPLUG_CPU=n +CONFIG_SUSPEND=n +CONFIG_HIBERNATION=n +CONFIG_RCU_FANOUT=3 +CONFIG_RCU_FANOUT_LEAF=2 +CONFIG_RCU_NOCB_CPU=n +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_PROVE_LOCKING=n +CONFIG_RCU_BOOST=n +CONFIG_DEBUG_OBJECTS_RCU_HEAD=n +CONFIG_RCU_EXPERT=y +CONFIG_RCU_TRACE=y diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh new file mode 100644 index 000000000000..34f2a1b35ee5 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh @@ -0,0 +1,52 @@ +#!/bin/bash +# +# Torture-suite-dependent shell functions for the rest of the scripts. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, you can access it online at +# http://www.gnu.org/licenses/gpl-2.0.html. +# +# Copyright (C) IBM Corporation, 2015 +# +# Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + +# rcuperf_param_nreaders bootparam-string +# +# Adds nreaders rcuperf module parameter if not already specified. +rcuperf_param_nreaders () { + if ! echo "$1" | grep -q "rcuperf.nreaders" + then + echo rcuperf.nreaders=-1 + fi +} + +# rcuperf_param_nwriters bootparam-string +# +# Adds nwriters rcuperf module parameter if not already specified. +rcuperf_param_nwriters () { + if ! echo "$1" | grep -q "rcuperf.nwriters" + then + echo rcuperf.nwriters=-1 + fi +} + +# per_version_boot_params bootparam-string config-file seconds +# +# Adds per-version torture-module parameters to kernels supporting them. 
+per_version_boot_params () { + echo $1 `rcuperf_param_nreaders "$1"` \ + `rcuperf_param_nwriters "$1"` \ + rcuperf.perf_runnable=1 \ + rcuperf.shutdown=1 \ + rcuperf.verbose=1 +} diff --git a/tools/testing/selftests/sigaltstack/Makefile b/tools/testing/selftests/sigaltstack/Makefile new file mode 100644 index 000000000000..56af56eda6fa --- /dev/null +++ b/tools/testing/selftests/sigaltstack/Makefile @@ -0,0 +1,8 @@ +CFLAGS = -Wall +BINARIES = sas +all: $(BINARIES) + +include ../lib.mk + +clean: + rm -rf $(BINARIES) diff --git a/tools/testing/selftests/sigaltstack/sas.c b/tools/testing/selftests/sigaltstack/sas.c new file mode 100644 index 000000000000..1bb01258e559 --- /dev/null +++ b/tools/testing/selftests/sigaltstack/sas.c @@ -0,0 +1,176 @@ +/* + * Stas Sergeev <stsp@users.sourceforge.net> + * + * test sigaltstack(SS_ONSTACK | SS_AUTODISARM) + * If that succeeds, then swapcontext() can be used inside sighandler safely. + * + */ + +#define _GNU_SOURCE +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/mman.h> +#include <ucontext.h> +#include <alloca.h> +#include <string.h> +#include <assert.h> +#include <errno.h> + +#ifndef SS_AUTODISARM +#define SS_AUTODISARM (1U << 31) +#endif + +static void *sstack, *ustack; +static ucontext_t uc, sc; +static const char *msg = "[OK]\tStack preserved"; +static const char *msg2 = "[FAIL]\tStack corrupted"; +struct stk_data { + char msg[128]; + int flag; +}; + +void my_usr1(int sig, siginfo_t *si, void *u) +{ + char *aa; + int err; + stack_t stk; + struct stk_data *p; + + register unsigned long sp asm("sp"); + + if (sp < (unsigned long)sstack || + sp >= (unsigned long)sstack + SIGSTKSZ) { + printf("[FAIL]\tSP is not on sigaltstack\n"); + exit(EXIT_FAILURE); + } + /* put some data on stack. 
other sighandler will try to overwrite it */ + aa = alloca(1024); + assert(aa); + p = (struct stk_data *)(aa + 512); + strcpy(p->msg, msg); + p->flag = 1; + printf("[RUN]\tsignal USR1\n"); + err = sigaltstack(NULL, &stk); + if (err) { + perror("[FAIL]\tsigaltstack()"); + exit(EXIT_FAILURE); + } + if (stk.ss_flags != SS_DISABLE) + printf("[FAIL]\tss_flags=%i, should be SS_DISABLE\n", + stk.ss_flags); + else + printf("[OK]\tsigaltstack is disabled in sighandler\n"); + swapcontext(&sc, &uc); + printf("%s\n", p->msg); + if (!p->flag) { + printf("[RUN]\tAborting\n"); + exit(EXIT_FAILURE); + } +} + +void my_usr2(int sig, siginfo_t *si, void *u) +{ + char *aa; + struct stk_data *p; + + printf("[RUN]\tsignal USR2\n"); + aa = alloca(1024); + /* dont run valgrind on this */ + /* try to find the data stored by previous sighandler */ + p = memmem(aa, 1024, msg, strlen(msg)); + if (p) { + printf("[FAIL]\tsigaltstack re-used\n"); + /* corrupt the data */ + strcpy(p->msg, msg2); + /* tell other sighandler that his data is corrupted */ + p->flag = 0; + } +} + +static void switch_fn(void) +{ + printf("[RUN]\tswitched to user ctx\n"); + raise(SIGUSR2); + setcontext(&sc); +} + +int main(void) +{ + struct sigaction act; + stack_t stk; + int err; + + sigemptyset(&act.sa_mask); + act.sa_flags = SA_ONSTACK | SA_SIGINFO; + act.sa_sigaction = my_usr1; + sigaction(SIGUSR1, &act, NULL); + act.sa_sigaction = my_usr2; + sigaction(SIGUSR2, &act, NULL); + sstack = mmap(NULL, SIGSTKSZ, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0); + if (sstack == MAP_FAILED) { + perror("mmap()"); + return EXIT_FAILURE; + } + + err = sigaltstack(NULL, &stk); + if (err) { + perror("[FAIL]\tsigaltstack()"); + exit(EXIT_FAILURE); + } + if (stk.ss_flags == SS_DISABLE) { + printf("[OK]\tInitial sigaltstack state was SS_DISABLE\n"); + } else { + printf("[FAIL]\tInitial sigaltstack state was %i; should have been SS_DISABLE\n", stk.ss_flags); + return EXIT_FAILURE; + } + + stk.ss_sp = sstack; + stk.ss_size = SIGSTKSZ; + stk.ss_flags = SS_ONSTACK | SS_AUTODISARM; + err = sigaltstack(&stk, NULL); + if (err) { + if (errno == EINVAL) { + printf("[NOTE]\tThe running kernel doesn't support SS_AUTODISARM\n"); + /* + * If test cases for the !SS_AUTODISARM variant were + * added, we could still run them. We don't have any + * test cases like that yet, so just exit and report + * success. 
+ */ + return 0; + } else { + perror("[FAIL]\tsigaltstack(SS_ONSTACK | SS_AUTODISARM)"); + return EXIT_FAILURE; + } + } + + ustack = mmap(NULL, SIGSTKSZ, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0); + if (ustack == MAP_FAILED) { + perror("mmap()"); + return EXIT_FAILURE; + } + getcontext(&uc); + uc.uc_link = NULL; + uc.uc_stack.ss_sp = ustack; + uc.uc_stack.ss_size = SIGSTKSZ; + makecontext(&uc, switch_fn, 0); + raise(SIGUSR1); + + err = sigaltstack(NULL, &stk); + if (err) { + perror("[FAIL]\tsigaltstack()"); + exit(EXIT_FAILURE); + } + if (stk.ss_flags != SS_AUTODISARM) { + printf("[FAIL]\tss_flags=%i, should be SS_AUTODISARM\n", + stk.ss_flags); + exit(EXIT_FAILURE); + } + printf("[OK]\tsigaltstack is still SS_AUTODISARM after signal\n"); + + printf("[OK]\tTest passed\n"); + return 0; +} diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index b47ebd170690..c73425de3cfe 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -9,6 +9,7 @@ TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_sysc TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ vdso_restorer +TARGETS_C_64BIT_ONLY := fsgsbase TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY) TARGETS_C_64BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_64BIT_ONLY) diff --git a/tools/testing/selftests/x86/fsgsbase.c b/tools/testing/selftests/x86/fsgsbase.c new file mode 100644 index 000000000000..5b2b4b3c634c --- /dev/null +++ b/tools/testing/selftests/x86/fsgsbase.c @@ -0,0 +1,398 @@ +/* + * fsgsbase.c, an fsgsbase test + * Copyright (c) 2014-2016 Andy Lutomirski + * GPL v2 + */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> +#include <sys/syscall.h> +#include <unistd.h> +#include <err.h> +#include <sys/user.h> +#include <asm/prctl.h> +#include <sys/prctl.h> +#include <signal.h> +#include <limits.h> +#include <sys/ucontext.h> +#include <sched.h> +#include <linux/futex.h> +#include <pthread.h> +#include <asm/ldt.h> +#include <sys/mman.h> + +#ifndef __x86_64__ +# error This test is 64-bit only +#endif + +static volatile sig_atomic_t want_segv; +static volatile unsigned long segv_addr; + +static int nerrs; + +static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), + int flags) +{ + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = handler; + sa.sa_flags = SA_SIGINFO | flags; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + err(1, "sigaction"); +} + +static void clearhandler(int sig) +{ + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_handler = SIG_DFL; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + err(1, "sigaction"); +} + +static void sigsegv(int sig, siginfo_t *si, void *ctx_void) +{ + ucontext_t *ctx = (ucontext_t*)ctx_void; + + if (!want_segv) { + clearhandler(SIGSEGV); + return; /* Crash cleanly. */ + } + + want_segv = false; + segv_addr = (unsigned long)si->si_addr; + + ctx->uc_mcontext.gregs[REG_RIP] += 4; /* Skip the faulting mov */ + +} + +enum which_base { FS, GS }; + +static unsigned long read_base(enum which_base which) +{ + unsigned long offset; + /* + * Unless we have FSGSBASE, there's no direct way to do this from + * user mode. We can get at it indirectly using signals, though. 
+ */ + + want_segv = true; + + offset = 0; + if (which == FS) { + /* Use a constant-length instruction here. */ + asm volatile ("mov %%fs:(%%rcx), %%rax" : : "c" (offset) : "rax"); + } else { + asm volatile ("mov %%gs:(%%rcx), %%rax" : : "c" (offset) : "rax"); + } + if (!want_segv) + return segv_addr + offset; + + /* + * If that didn't segfault, try the other end of the address space. + * Unless we get really unlucky and run into the vsyscall page, this + * is guaranteed to segfault. + */ + + offset = (ULONG_MAX >> 1) + 1; + if (which == FS) { + asm volatile ("mov %%fs:(%%rcx), %%rax" + : : "c" (offset) : "rax"); + } else { + asm volatile ("mov %%gs:(%%rcx), %%rax" + : : "c" (offset) : "rax"); + } + if (!want_segv) + return segv_addr + offset; + + abort(); +} + +static void check_gs_value(unsigned long value) +{ + unsigned long base; + unsigned short sel; + + printf("[RUN]\tARCH_SET_GS to 0x%lx\n", value); + if (syscall(SYS_arch_prctl, ARCH_SET_GS, value) != 0) + err(1, "ARCH_SET_GS"); + + asm volatile ("mov %%gs, %0" : "=rm" (sel)); + base = read_base(GS); + if (base == value) { + printf("[OK]\tGSBASE was set as expected (selector 0x%hx)\n", + sel); + } else { + nerrs++; + printf("[FAIL]\tGSBASE was not as expected: got 0x%lx (selector 0x%hx)\n", + base, sel); + } + + if (syscall(SYS_arch_prctl, ARCH_GET_GS, &base) != 0) + err(1, "ARCH_GET_GS"); + if (base == value) { + printf("[OK]\tARCH_GET_GS worked as expected (selector 0x%hx)\n", + sel); + } else { + nerrs++; + printf("[FAIL]\tARCH_GET_GS was not as expected: got 0x%lx (selector 0x%hx)\n", + base, sel); + } +} + +static void mov_0_gs(unsigned long initial_base, bool schedule) +{ + unsigned long base, arch_base; + + printf("[RUN]\tARCH_SET_GS to 0x%lx then mov 0 to %%gs%s\n", initial_base, schedule ? " and schedule " : ""); + if (syscall(SYS_arch_prctl, ARCH_SET_GS, initial_base) != 0) + err(1, "ARCH_SET_GS"); + + if (schedule) + usleep(10); + + asm volatile ("mov %0, %%gs" : : "rm" (0)); + base = read_base(GS); + if (syscall(SYS_arch_prctl, ARCH_GET_GS, &arch_base) != 0) + err(1, "ARCH_GET_GS"); + if (base == arch_base) { + printf("[OK]\tGSBASE is 0x%lx\n", base); + } else { + nerrs++; + printf("[FAIL]\tGSBASE changed to 0x%lx but kernel reports 0x%lx\n", base, arch_base); + } +} + +static volatile unsigned long remote_base; +static volatile bool remote_hard_zero; +static volatile unsigned int ftx; + +/* + * ARCH_SET_FS/GS(0) may or may not program a selector of zero. HARD_ZERO + * means to force the selector to zero to improve test coverage. + */ +#define HARD_ZERO 0xa1fa5f343cb85fa4 + +static void do_remote_base() +{ + unsigned long to_set = remote_base; + bool hard_zero = false; + if (to_set == HARD_ZERO) { + to_set = 0; + hard_zero = true; + } + + if (syscall(SYS_arch_prctl, ARCH_SET_GS, to_set) != 0) + err(1, "ARCH_SET_GS"); + + if (hard_zero) + asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0)); + + unsigned short sel; + asm volatile ("mov %%gs, %0" : "=rm" (sel)); + printf("\tother thread: ARCH_SET_GS(0x%lx)%s -- sel is 0x%hx\n", + to_set, hard_zero ? " and clear gs" : "", sel); +} + +void do_unexpected_base(void) +{ + /* + * The goal here is to try to arrange for GS == 0, GSBASE != + * 0, and for the the kernel the think that GSBASE == 0. + * + * To make the test as reliable as possible, this uses + * explicit descriptorss. (This is not the only way. This + * could use ARCH_SET_GS with a low, nonzero base, but the + * relevant side effect of ARCH_SET_GS could change.) 
+	 */
+
+	/* Step 1: tell the kernel that we have GSBASE == 0. */
+	if (syscall(SYS_arch_prctl, ARCH_SET_GS, 0) != 0)
+		err(1, "ARCH_SET_GS");
+
+	/* Step 2: change GSBASE without telling the kernel. */
+	struct user_desc desc = {
+		.entry_number = 0,
+		.base_addr = 0xBAADF00D,
+		.limit = 0xfffff,
+		.seg_32bit = 1,
+		.contents = 0, /* Data, grow-up */
+		.read_exec_only = 0,
+		.limit_in_pages = 1,
+		.seg_not_present = 0,
+		.useable = 0
+	};
+	if (syscall(SYS_modify_ldt, 1, &desc, sizeof(desc)) == 0) {
+		printf("\tother thread: using LDT slot 0\n");
+		asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0x7));
+	} else {
+		/* No modify_ldt for us (configured out, perhaps) */
+
+		struct user_desc *low_desc = mmap(
+			NULL, sizeof(desc),
+			PROT_READ | PROT_WRITE,
+			MAP_PRIVATE | MAP_ANONYMOUS | MAP_32BIT, -1, 0);
+		memcpy(low_desc, &desc, sizeof(desc));
+
+		low_desc->entry_number = -1;
+
+		/* 32-bit set_thread_area */
+		long ret;
+		asm volatile ("int $0x80"
+			      : "=a" (ret) : "a" (243), "b" (low_desc)
+			      : "flags");
+		memcpy(&desc, low_desc, sizeof(desc));
+		munmap(low_desc, sizeof(desc));
+
+		if (ret != 0) {
+			printf("[NOTE]\tcould not create a segment -- test won't do anything\n");
+			return;
+		}
+		printf("\tother thread: using GDT slot %d\n", desc.entry_number);
+		asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)((desc.entry_number << 3) | 0x3)));
+	}
+
+	/*
+	 * Step 3: set the selector back to zero. On AMD chips, this will
+	 * preserve GSBASE.
+	 */
+
+	asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0));
+}
+
+static void *threadproc(void *ctx)
+{
+	while (1) {
+		while (ftx == 0)
+			syscall(SYS_futex, &ftx, FUTEX_WAIT, 0, NULL, NULL, 0);
+		if (ftx == 3)
+			return NULL;
+
+		if (ftx == 1)
+			do_remote_base();
+		else if (ftx == 2)
+			do_unexpected_base();
+		else
+			errx(1, "helper thread got bad command");
+
+		ftx = 0;
+		syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0);
+	}
+}
+
+static void set_gs_and_switch_to(unsigned long local, unsigned long remote)
+{
+	unsigned long base;
+
+	bool hard_zero = false;
+	if (local == HARD_ZERO) {
+		hard_zero = true;
+		local = 0;
+	}
+
+	printf("[RUN]\tARCH_SET_GS(0x%lx)%s, then schedule to 0x%lx\n",
+	       local, hard_zero ? " and clear gs" : "", remote);
+	if (syscall(SYS_arch_prctl, ARCH_SET_GS, local) != 0)
+		err(1, "ARCH_SET_GS");
+	if (hard_zero)
+		asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0));
+
+	if (read_base(GS) != local) {
+		nerrs++;
+		printf("[FAIL]\tGSBASE wasn't set as expected\n");
+	}
+
+	remote_base = remote;
+	ftx = 1;
+	syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0);
+	while (ftx != 0)
+		syscall(SYS_futex, &ftx, FUTEX_WAIT, 1, NULL, NULL, 0);
+
+	base = read_base(GS);
+	if (base == local) {
+		printf("[OK]\tGSBASE remained 0x%lx\n", local);
+	} else {
+		nerrs++;
+		printf("[FAIL]\tGSBASE changed to 0x%lx\n", base);
+	}
+}
+
+static void test_unexpected_base(void)
+{
+	unsigned long base;
+
+	printf("[RUN]\tARCH_SET_GS(0), clear gs, then manipulate GSBASE in a different thread\n");
+	if (syscall(SYS_arch_prctl, ARCH_SET_GS, 0) != 0)
+		err(1, "ARCH_SET_GS");
+	asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0));
+
+	ftx = 2;
+	syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0);
+	while (ftx != 0)
+		syscall(SYS_futex, &ftx, FUTEX_WAIT, 1, NULL, NULL, 0);
+
+	base = read_base(GS);
+	if (base == 0) {
+		printf("[OK]\tGSBASE remained 0\n");
+	} else {
+		nerrs++;
+		printf("[FAIL]\tGSBASE changed to 0x%lx\n", base);
+	}
+}
+
+int main()
+{
+	pthread_t thread;
+
+	sethandler(SIGSEGV, sigsegv, 0);
+
+	check_gs_value(0);
+	check_gs_value(1);
+	check_gs_value(0x200000000);
+	check_gs_value(0);
+	check_gs_value(0x200000000);
+	check_gs_value(1);
+
+	for (int sched = 0; sched < 2; sched++) {
+		mov_0_gs(0, !!sched);
+		mov_0_gs(1, !!sched);
+		mov_0_gs(0x200000000, !!sched);
+	}
+
+	/* Set up for multithreading. */
+
+	cpu_set_t cpuset;
+	CPU_ZERO(&cpuset);
+	CPU_SET(0, &cpuset);
+	if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0)
+		err(1, "sched_setaffinity to CPU 0"); /* should never fail */
+
+	if (pthread_create(&thread, 0, threadproc, 0) != 0)
+		err(1, "pthread_create");
+
+	static unsigned long bases_with_hard_zero[] = {
+		0, HARD_ZERO, 1, 0x200000000,
+	};
+
+	for (int local = 0; local < 4; local++) {
+		for (int remote = 0; remote < 4; remote++) {
+			set_gs_and_switch_to(bases_with_hard_zero[local],
+					     bases_with_hard_zero[remote]);
+		}
+	}
+
+	test_unexpected_base();
+
+	ftx = 3; /* Kill the thread. */
+	syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0);
+
+	if (pthread_join(thread, NULL) != 0)
+		err(1, "pthread_join");
+
+	return nerrs == 0 ? 0 : 1;
+}
diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c
index 31a3035cd4eb..4af47079cf04 100644
--- a/tools/testing/selftests/x86/ldt_gdt.c
+++ b/tools/testing/selftests/x86/ldt_gdt.c
@@ -21,6 +21,9 @@
 #include <pthread.h>
 #include <sched.h>
 #include <linux/futex.h>
+#include <sys/mman.h>
+#include <asm/prctl.h>
+#include <sys/prctl.h>
 
 #define AR_ACCESSED (1<<8)
 
@@ -44,6 +47,12 @@
 
 static int nerrs;
 
+/* Points to an array of 1024 ints, each holding its own index. */
+static const unsigned int *counter_page;
+static struct user_desc *low_user_desc;
+static struct user_desc *low_user_desc_clear; /* Use to delete GDT entry */
+static int gdt_entry_num;
+
 static void check_invalid_segment(uint16_t index, int ldt)
 {
 	uint32_t has_limit = 0, has_ar = 0, limit, ar;
@@ -561,16 +570,257 @@ static void do_exec_test(void)
 	}
 }
 
+static void setup_counter_page(void)
+{
+	unsigned int *page = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
+				  MAP_ANONYMOUS | MAP_PRIVATE | MAP_32BIT, -1, 0);
+	if (page == MAP_FAILED)
+		err(1, "mmap");
+
+	for (int i = 0; i < 1024; i++)
+		page[i] = i;
+	counter_page = page;
+}
+
+static int invoke_set_thread_area(void)
+{
+	int ret;
+	asm volatile ("int $0x80"
+		      : "=a" (ret), "+m" (low_user_desc) :
+			"a" (243), "b" (low_user_desc)
+		      : "flags");
+	return ret;
+}
+
+static void setup_low_user_desc(void)
+{
+	low_user_desc = mmap(NULL, 2 * sizeof(struct user_desc),
+			     PROT_READ | PROT_WRITE,
+			     MAP_ANONYMOUS | MAP_PRIVATE | MAP_32BIT, -1, 0);
+	if (low_user_desc == MAP_FAILED)
+		err(1, "mmap");
+
+	low_user_desc->entry_number = -1;
+	low_user_desc->base_addr = (unsigned long)&counter_page[1];
+	low_user_desc->limit = 0xfffff;
+	low_user_desc->seg_32bit = 1;
+	low_user_desc->contents = 0; /* Data, grow-up */
+	low_user_desc->read_exec_only = 0;
+	low_user_desc->limit_in_pages = 1;
+	low_user_desc->seg_not_present = 0;
+	low_user_desc->useable = 0;
+
+	if (invoke_set_thread_area() == 0) {
+		gdt_entry_num = low_user_desc->entry_number;
+		printf("[NOTE]\tset_thread_area is available; will use GDT index %d\n", gdt_entry_num);
+	} else {
+		printf("[NOTE]\tset_thread_area is unavailable\n");
+	}
+
+	low_user_desc_clear = low_user_desc + 1;
+	low_user_desc_clear->entry_number = gdt_entry_num;
+	low_user_desc_clear->read_exec_only = 1;
+	low_user_desc_clear->seg_not_present = 1;
+}
+
+static void test_gdt_invalidation(void)
+{
+	if (!gdt_entry_num)
+		return; /* 64-bit only system -- we can't use set_thread_area */
+
+	unsigned short prev_sel;
+	unsigned short sel;
+	unsigned int eax;
+	const char *result;
+#ifdef __x86_64__
+	unsigned long saved_base;
+	unsigned long new_base;
+#endif
+
+	/* Test DS */
+	invoke_set_thread_area();
+	eax = 243;
+	sel = (gdt_entry_num << 3) | 3;
+	asm volatile ("movw %%ds, %[prev_sel]\n\t"
+		      "movw %[sel], %%ds\n\t"
+#ifdef __i386__
+		      "pushl %%ebx\n\t"
+#endif
+		      "movl %[arg1], %%ebx\n\t"
+		      "int $0x80\n\t"	/* Should invalidate ds */
+#ifdef __i386__
+		      "popl %%ebx\n\t"
+#endif
+		      "movw %%ds, %[sel]\n\t"
+		      "movw %[prev_sel], %%ds"
+		      : [prev_sel] "=&r" (prev_sel), [sel] "+r" (sel),
+			"+a" (eax)
+		      : "m" (low_user_desc_clear),
+			[arg1] "r" ((unsigned int)(unsigned long)low_user_desc_clear)
+		      : "flags");
+
+	if (sel != 0) {
+		result = "FAIL";
+		nerrs++;
+	} else {
+		result = "OK";
+	}
+	printf("[%s]\tInvalidate DS with set_thread_area: new DS = 0x%hx\n",
+	       result, sel);
+
+	/* Test ES */
+	invoke_set_thread_area();
+	eax = 243;
+	sel = (gdt_entry_num << 3) | 3;
+	asm volatile ("movw %%es, %[prev_sel]\n\t"
+		      "movw %[sel], %%es\n\t"
+#ifdef __i386__
+		      "pushl %%ebx\n\t"
+#endif
+		      "movl %[arg1], %%ebx\n\t"
+		      "int $0x80\n\t"	/* Should invalidate es */
+#ifdef __i386__
+		      "popl %%ebx\n\t"
+#endif
+		      "movw %%es, %[sel]\n\t"
+		      "movw %[prev_sel], %%es"
+		      : [prev_sel] "=&r" (prev_sel), [sel] "+r" (sel),
+			"+a" (eax)
+		      : "m" (low_user_desc_clear),
+			[arg1] "r" ((unsigned int)(unsigned long)low_user_desc_clear)
+		      : "flags");
+
+	if (sel != 0) {
+		result = "FAIL";
+		nerrs++;
+	} else {
+		result = "OK";
+	}
+	printf("[%s]\tInvalidate ES with set_thread_area: new ES = 0x%hx\n",
+	       result, sel);
+
+	/* Test FS */
+	invoke_set_thread_area();
+	eax = 243;
+	sel = (gdt_entry_num << 3) | 3;
+#ifdef __x86_64__
+	syscall(SYS_arch_prctl, ARCH_GET_FS, &saved_base);
+#endif
+	asm volatile ("movw %%fs, %[prev_sel]\n\t"
+		      "movw %[sel], %%fs\n\t"
+#ifdef __i386__
+		      "pushl %%ebx\n\t"
+#endif
+		      "movl %[arg1], %%ebx\n\t"
+		      "int $0x80\n\t"	/* Should invalidate fs */
+#ifdef __i386__
+		      "popl %%ebx\n\t"
+#endif
+		      "movw %%fs, %[sel]\n\t"
+		      : [prev_sel] "=&r" (prev_sel), [sel] "+r" (sel),
+			"+a" (eax)
+		      : "m" (low_user_desc_clear),
+			[arg1] "r" ((unsigned int)(unsigned long)low_user_desc_clear)
+		      : "flags");
+
+#ifdef __x86_64__
+	syscall(SYS_arch_prctl, ARCH_GET_FS, &new_base);
+#endif
+
+	/* Restore FS/BASE for glibc */
+	asm volatile ("movw %[prev_sel], %%fs" : : [prev_sel] "rm" (prev_sel));
+#ifdef __x86_64__
+	if (saved_base)
+		syscall(SYS_arch_prctl, ARCH_SET_FS, saved_base);
+#endif
+
+	if (sel != 0) {
+		result = "FAIL";
+		nerrs++;
+	} else {
+		result = "OK";
+	}
+	printf("[%s]\tInvalidate FS with set_thread_area: new FS = 0x%hx\n",
+	       result, sel);
+
+#ifdef __x86_64__
+	if (sel == 0 && new_base != 0) {
+		nerrs++;
+		printf("[FAIL]\tNew FSBASE was 0x%lx\n", new_base);
+	} else {
+		printf("[OK]\tNew FSBASE was zero\n");
+	}
+#endif
+
+	/* Test GS */
+	invoke_set_thread_area();
+	eax = 243;
+	sel = (gdt_entry_num << 3) | 3;
+#ifdef __x86_64__
+	syscall(SYS_arch_prctl, ARCH_GET_GS, &saved_base);
+#endif
+	asm volatile ("movw %%gs, %[prev_sel]\n\t"
+		      "movw %[sel], %%gs\n\t"
+#ifdef __i386__
+		      "pushl %%ebx\n\t"
+#endif
+		      "movl %[arg1], %%ebx\n\t"
+		      "int $0x80\n\t"	/* Should invalidate gs */
+#ifdef __i386__
+		      "popl %%ebx\n\t"
+#endif
+		      "movw %%gs, %[sel]\n\t"
+		      : [prev_sel] "=&r" (prev_sel), [sel] "+r" (sel),
+			"+a" (eax)
+		      : "m" (low_user_desc_clear),
+			[arg1] "r" ((unsigned int)(unsigned long)low_user_desc_clear)
+		      : "flags");
+
+#ifdef __x86_64__
+	syscall(SYS_arch_prctl, ARCH_GET_GS, &new_base);
+#endif
+
+	/* Restore GS/BASE for glibc */
+	asm volatile ("movw %[prev_sel], %%gs" : : [prev_sel] "rm" (prev_sel));
+#ifdef __x86_64__
+	if (saved_base)
+		syscall(SYS_arch_prctl, ARCH_SET_GS, saved_base);
+#endif
+
+	if (sel != 0) {
+		result = "FAIL";
+		nerrs++;
+	} else {
+		result = "OK";
+	}
+	printf("[%s]\tInvalidate GS with set_thread_area: new GS = 0x%hx\n",
+	       result, sel);
+
+#ifdef __x86_64__
+	if (sel == 0 && new_base != 0) {
+		nerrs++;
+		printf("[FAIL]\tNew GSBASE was 0x%lx\n", new_base);
+	} else {
+		printf("[OK]\tNew GSBASE was zero\n");
+	}
+#endif
+}
+
 int main(int argc, char **argv)
 {
 	if (argc == 1 && !strcmp(argv[0], "ldt_gdt_test_exec"))
 		return finish_exec_test();
 
+	setup_counter_page();
+	setup_low_user_desc();
+
 	do_simple_tests();
 
 	do_multicpu_tests();
 
 	do_exec_test();
 
+	test_gdt_invalidation();
+
 	return nerrs ? 1 : 0;
 }