diff options
243 files changed, 14277 insertions, 6002 deletions
diff --git a/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCU.svg b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCU.svg new file mode 100644 index 000000000000..727e270b11e4 --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCU.svg @@ -0,0 +1,474 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- Creator: fig2dev Version 3.2 Patchlevel 5e --> + +<!-- CreationDate: Wed Dec 9 17:28:20 2015 --> + +<!-- Magnification: 3.000 --> + +<svg + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns="http://www.w3.org/2000/svg" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + width="9.1in" + height="8.9in" + viewBox="-66 -66 10932 10707" + id="svg2" + version="1.1" + inkscape:version="0.48.4 r9939" + sodipodi:docname="BigTreeClassicRCU.fig"> + <metadata + id="metadata106"> + <rdf:RDF> + <cc:Work + rdf:about=""> + <dc:format>image/svg+xml</dc:format> + <dc:type + rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> + <dc:title></dc:title> + </cc:Work> + </rdf:RDF> + </metadata> + <defs + id="defs104"> + <marker + inkscape:stockid="Arrow1Mend" + orient="auto" + refY="0.0" + refX="0.0" + id="Arrow1Mend" + style="overflow:visible;"> + <path + id="path3864" + d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z " + style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;" + transform="scale(0.4) rotate(180) translate(10,0)" /> + </marker> + </defs> + <sodipodi:namedview + pagecolor="#ffffff" + bordercolor="#666666" + borderopacity="1" + objecttolerance="10" + gridtolerance="10" + guidetolerance="10" + inkscape:pageopacity="0" + inkscape:pageshadow="2" + inkscape:window-width="973" + inkscape:window-height="1137" + id="namedview102" + showgrid="false" + inkscape:zoom="0.9743589" + 
inkscape:cx="409.50003" + inkscape:cy="400.49997" + inkscape:window-x="915" + inkscape:window-y="24" + inkscape:window-maximized="0" + inkscape:current-layer="g4" /> + <g + style="stroke-width:.025in; fill:none" + id="g4"> + <!-- Line: box --> + <rect + x="0" + y="0" + width="10800" + height="5625" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; " + id="rect6" /> + <!-- Line: box --> + <rect + x="1125" + y="3600" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect8" /> + <!-- Line: box --> + <rect + x="3825" + y="900" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect10" /> + <!-- Line: box --> + <rect + x="6525" + y="3600" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect12" /> + <!-- Line --> + <polyline + points="3375,6525 3375,5046 " + style="stroke:#00d1d1;stroke-width:44.9934641;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline14" /> + <!-- Arrowhead on XXXpoint 3375 6525 - 3375 4860--> + <!-- Circle --> + <circle + cx="7425" + cy="6075" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle18" /> + <!-- Circle --> + <circle + cx="7875" + cy="6075" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle20" /> + <!-- Circle --> + <circle + cx="8325" + cy="6075" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle22" /> + <!-- Circle --> + <circle + cx="2025" + cy="6075" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle24" /> + <!-- Circle --> + <circle + cx="2475" + cy="6075" + r="114" + 
style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle26" /> + <!-- Circle --> + <circle + cx="2925" + cy="6075" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle28" /> + <!-- Circle --> + <circle + cx="4725" + cy="4275" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle30" /> + <!-- Circle --> + <circle + cx="5175" + cy="4275" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle32" /> + <!-- Circle --> + <circle + cx="5625" + cy="4275" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle34" /> + <!-- Line: box --> + <rect + x="2025" + y="6525" + width="2700" + height="1800" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect36" /> + <!-- Line --> + <polyline + points="2475,3600 3975,2310 " + style="stroke:#00d1d1;stroke-width:44.9934641;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline38" /> + <!-- Arrowhead on XXXpoint 2475 3600 - 4116 2190--> + <!-- Line --> + <polyline + points="7875,3600 6372,2310 " + style="stroke:#00d1d1;stroke-width:44.9934641;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline42" /> + <!-- Arrowhead on XXXpoint 7875 3600 - 6231 2190--> + <!-- Line --> + <polyline + points="6975,8775 6975,5046 " + style="stroke:#00d1d1;stroke-width:44.9934641;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline46" /> + <!-- Arrowhead on XXXpoint 6975 8775 - 6975 4860--> + <!-- Line --> + <polyline + points="1575,8775 1575,5046 " + style="stroke:#00d1d1;stroke-width:44.9934641;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline50" /> + <!-- Arrowhead on XXXpoint 1575 8775 - 1575 4860--> + 
<!-- Line --> + <polyline + points="8775,6525 8775,5046 " + style="stroke:#00d1d1;stroke-width:44.9934641;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline54" /> + <!-- Arrowhead on XXXpoint 8775 6525 - 8775 4860--> + <!-- Text --> + <text + xml:space="preserve" + x="1575" + y="9225" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text58">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="1575" + y="9675" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text60">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="1575" + y="10350" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text62">CPU 0</text> + <!-- Text --> + <text + xml:space="preserve" + x="3375" + y="6975" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text64">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="3375" + y="7425" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text66">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="3375" + y="8100" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text68">CPU 15</text> + <!-- Text --> + <text + xml:space="preserve" + x="6975" + y="9225" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text70">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="6975" + y="9675" + fill="#000000" + font-family="Courier" + font-style="normal" + 
font-weight="bold" + font-size="288" + text-anchor="middle" + id="text72">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="6975" + y="10350" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text74">CPU 1007</text> + <!-- Text --> + <text + xml:space="preserve" + x="8730" + y="6930" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text76">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="8730" + y="7380" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text78">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="8730" + y="8055" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text80">CPU 1023</text> + <!-- Text --> + <text + xml:space="preserve" + x="225" + y="450" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="start" + id="text82">struct rcu_state</text> + <!-- Text --> + <text + xml:space="preserve" + x="2475" + y="4050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text84">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="2475" + y="4500" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text86">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="7875" + y="4500" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text88">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="7875" + y="4050" + fill="#000000" + 
font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text90">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="5175" + y="1350" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text92">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="5175" + y="1800" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text94">rcu_node</text> + <!-- Line: box --> + <rect + x="225" + y="8775" + width="2700" + height="1800" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect96" /> + <!-- Line: box --> + <rect + x="5625" + y="8775" + width="2700" + height="1800" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect98" /> + <!-- Line: box --> + <rect + x="7380" + y="6480" + width="2700" + height="1800" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect100" /> + </g> +</svg> diff --git a/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBH.svg b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBH.svg new file mode 100644 index 000000000000..9bbb1944f962 --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBH.svg @@ -0,0 +1,499 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- Creator: fig2dev Version 3.2 Patchlevel 5e --> + +<!-- CreationDate: Wed Dec 9 17:26:09 2015 --> + +<!-- Magnification: 2.000 --> + +<svg + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns="http://www.w3.org/2000/svg" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + 
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + width="5.7in" + height="6.6in" + viewBox="-44 -44 6838 7888" + id="svg2" + version="1.1" + inkscape:version="0.48.4 r9939" + sodipodi:docname="BigTreeClassicRCUBH.fig"> + <metadata + id="metadata110"> + <rdf:RDF> + <cc:Work + rdf:about=""> + <dc:format>image/svg+xml</dc:format> + <dc:type + rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> + <dc:title></dc:title> + </cc:Work> + </rdf:RDF> + </metadata> + <defs + id="defs108"> + <marker + inkscape:stockid="Arrow1Mend" + orient="auto" + refY="0.0" + refX="0.0" + id="Arrow1Mend" + style="overflow:visible;"> + <path + id="path3868" + d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z " + style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;" + transform="scale(0.4) rotate(180) translate(10,0)" /> + </marker> + <marker + inkscape:stockid="Arrow2Mend" + orient="auto" + refY="0.0" + refX="0.0" + id="Arrow2Mend" + style="overflow:visible;"> + <path + id="path3886" + style="fill-rule:evenodd;stroke-width:0.62500000;stroke-linejoin:round;" + d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z " + transform="scale(0.6) rotate(180) translate(0,0)" /> + </marker> + </defs> + <sodipodi:namedview + pagecolor="#ffffff" + bordercolor="#666666" + borderopacity="1" + objecttolerance="10" + gridtolerance="10" + guidetolerance="10" + inkscape:pageopacity="0" + inkscape:pageshadow="2" + inkscape:window-width="878" + inkscape:window-height="1148" + id="namedview106" + showgrid="false" + inkscape:zoom="1.3547758" + inkscape:cx="256.5" + inkscape:cy="297" + inkscape:window-x="45" + inkscape:window-y="24" + inkscape:window-maximized="0" + inkscape:current-layer="g4" /> + <g + style="stroke-width:.025in; fill:none" + id="g4"> + <!-- Line: box --> + <rect + x="450" + y="0" + width="6300" + height="7350" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; 
stroke-linecap:butt; fill:#ffffff; " + id="rect6" /> + <!-- Line: box --> + <rect + x="4950" + y="4950" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect8" /> + <!-- Line: box --> + <rect + x="750" + y="600" + width="5700" + height="3750" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; " + id="rect10" /> + <!-- Line: box --> + <rect + x="0" + y="450" + width="6300" + height="7350" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; " + id="rect12" /> + <!-- Line: box --> + <rect + x="300" + y="1050" + width="5700" + height="3750" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; " + id="rect14" /> + <!-- Circle --> + <circle + cx="2850" + cy="3900" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle16" /> + <!-- Circle --> + <circle + cx="3150" + cy="3900" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle18" /> + <!-- Circle --> + <circle + cx="3450" + cy="3900" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle20" /> + <!-- Circle --> + <circle + cx="1350" + cy="5100" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle22" /> + <!-- Circle --> + <circle + cx="1650" + cy="5100" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle24" /> + <!-- Circle --> + <circle + cx="1950" + cy="5100" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle26" /> + <!-- Circle --> + <circle + cx="4350" + cy="5100" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle28" /> + <!-- Circle --> + <circle + cx="4650" + cy="5100" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle30" /> + <!-- Circle --> + <circle + cx="4950" 
+ cy="5100" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle32" /> + <!-- Line --> + <polyline + points="1350,3450 2350,2590 " + style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline34" /> + <!-- Arrowhead on XXXpoint 1350 3450 - 2444 2510--> + <!-- Line --> + <polyline + points="4950,3450 3948,2590 " + style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline38" /> + <!-- Arrowhead on XXXpoint 4950 3450 - 3854 2510--> + <!-- Line: box --> + <rect + x="750" + y="3450" + width="1800" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect42" /> + <!-- Line --> + <polyline + points="2250,5400 2250,4414 " + style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline44" /> + <!-- Arrowhead on XXXpoint 2250 5400 - 2250 4290--> + <!-- Line: box --> + <rect + x="1500" + y="5400" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect48" /> + <!-- Line: box --> + <rect + x="300" + y="6600" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect50" /> + <!-- Line: box --> + <rect + x="3750" + y="3450" + width="1800" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect52" /> + <!-- Line: box --> + <rect + x="4500" + y="5400" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; 
fill:#87cfff; " + id="rect54" /> + <!-- Line: box --> + <rect + x="3300" + y="6600" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect56" /> + <!-- Line: box --> + <rect + x="2250" + y="1650" + width="1800" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect58" /> + <!-- Text --> + <text + xml:space="preserve" + x="6450" + y="300" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="192" + text-anchor="end" + id="text60">rcu_bh</text> + <!-- Text --> + <text + xml:space="preserve" + x="3150" + y="1950" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text62">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="3150" + y="2250" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text64">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="1650" + y="3750" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text66">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="1650" + y="4050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text68">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="4650" + y="4050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text70">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="4650" + y="3750" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + 
id="text72">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="2250" + y="5700" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text74">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="2250" + y="6000" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text76">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="1050" + y="6900" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text78">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="1050" + y="7200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text80">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="5250" + y="5700" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text82">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="5250" + y="6000" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text84">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="4050" + y="6900" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text86">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="4050" + y="7200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text88">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="450" + y="1350" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + 
text-anchor="start" + id="text90">struct rcu_state</text> + <!-- Text --> + <text + xml:space="preserve" + x="6000" + y="750" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="192" + text-anchor="end" + id="text92">rcu_sched</text> + <!-- Line --> + <polyline + points="5250,5400 5250,4414 " + style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline94" /> + <!-- Arrowhead on XXXpoint 5250 5400 - 5250 4290--> + <!-- Line --> + <polyline + points="4050,6600 4050,4414 " + style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline98" /> + <!-- Arrowhead on XXXpoint 4050 6600 - 4050 4290--> + <!-- Line --> + <polyline + points="1050,6600 1050,4414 " + style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline102" /> + <!-- Arrowhead on XXXpoint 1050 6600 - 1050 4290--> + </g> +</svg> diff --git a/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBHdyntick.svg b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBHdyntick.svg new file mode 100644 index 000000000000..21ba7823479d --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBHdyntick.svg @@ -0,0 +1,695 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- Creator: fig2dev Version 3.2 Patchlevel 5e --> + +<!-- CreationDate: Wed Dec 9 17:20:02 2015 --> + +<!-- Magnification: 2.000 --> + +<svg + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns="http://www.w3.org/2000/svg" + 
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + width="5.7in" + height="8.6in" + viewBox="-44 -44 6838 10288" + id="svg2" + version="1.1" + inkscape:version="0.48.4 r9939" + sodipodi:docname="BigTreeClassicRCUBHdyntick.fig"> + <metadata + id="metadata166"> + <rdf:RDF> + <cc:Work + rdf:about=""> + <dc:format>image/svg+xml</dc:format> + <dc:type + rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> + <dc:title></dc:title> + </cc:Work> + </rdf:RDF> + </metadata> + <defs + id="defs164"> + <marker + inkscape:stockid="Arrow1Mend" + orient="auto" + refY="0.0" + refX="0.0" + id="Arrow1Mend" + style="overflow:visible;"> + <path + id="path3924" + d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z " + style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;" + transform="scale(0.4) rotate(180) translate(10,0)" /> + </marker> + <marker + inkscape:stockid="Arrow2Lend" + orient="auto" + refY="0.0" + refX="0.0" + id="Arrow2Lend" + style="overflow:visible;"> + <path + id="path3936" + style="fill-rule:evenodd;stroke-width:0.62500000;stroke-linejoin:round;" + d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z " + transform="scale(1.1) rotate(180) translate(1,0)" /> + </marker> + </defs> + <sodipodi:namedview + pagecolor="#ffffff" + bordercolor="#666666" + borderopacity="1" + objecttolerance="10" + gridtolerance="10" + guidetolerance="10" + inkscape:pageopacity="0" + inkscape:pageshadow="2" + inkscape:window-width="845" + inkscape:window-height="988" + id="namedview162" + showgrid="false" + inkscape:zoom="1.0452196" + inkscape:cx="256.5" + inkscape:cy="387.00003" + inkscape:window-x="356" + inkscape:window-y="61" + inkscape:window-maximized="0" + inkscape:current-layer="g4" /> + <g + style="stroke-width:.025in; fill:none" + id="g4"> + <!-- Line: box --> + <rect + x="450" + y="0" + width="6300" + 
height="7350" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; " + id="rect6" /> + <!-- Line: box --> + <rect + x="4950" + y="4950" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect8" /> + <!-- Line: box --> + <rect + x="750" + y="600" + width="5700" + height="3750" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; " + id="rect10" /> + <!-- Line --> + <polyline + points="5250,8100 5688,5912 " + style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="polyline12" /> + <!-- Arrowhead on XXXpoint 5250 8100 - 5710 5790--> + <polyline + points="5714 6068 5704 5822 5598 6044 " + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; " + id="polyline14" /> + <!-- Line --> + <polyline + points="4050,9300 4486,7262 " + style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="polyline16" /> + <!-- Arrowhead on XXXpoint 4050 9300 - 4512 7140--> + <polyline + points="4514 7418 4506 7172 4396 7394 " + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; " + id="polyline18" /> + <!-- Line --> + <polyline + points="1040,9300 1476,7262 " + style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="polyline20" /> + <!-- Arrowhead on XXXpoint 1040 9300 - 1502 7140--> + <polyline + points="1504 7418 1496 7172 1386 7394 " + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; " + id="polyline22" /> + <!-- Line --> + <polyline + points="2240,8100 2676,6062 " + style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="polyline24" /> + <!-- Arrowhead on XXXpoint 2240 8100 - 2702 5940--> + <polyline + points="2704 6218 2696 5972 2586 6194 " + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; " + id="polyline26" /> + 
<!-- Line: box --> + <rect + x="0" + y="450" + width="6300" + height="7350" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; " + id="rect28" /> + <!-- Line: box --> + <rect + x="300" + y="1050" + width="5700" + height="3750" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; " + id="rect30" /> + <!-- Line --> + <polyline + points="1350,3450 2350,2590 " + style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline32" /> + <!-- Arrowhead on XXXpoint 1350 3450 - 2444 2510--> + <!-- Line --> + <polyline + points="4950,3450 3948,2590 " + style="stroke:#00d1d1;stroke-width:30.0045575;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline36" /> + <!-- Arrowhead on XXXpoint 4950 3450 - 3854 2510--> + <!-- Line --> + <polyline + points="4050,6600 4050,4414 " + style="stroke:#00d1d1;stroke-width:30.00455750000000066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline40" /> + <!-- Arrowhead on XXXpoint 4050 6600 - 4050 4290--> + <!-- Line --> + <polyline + points="1050,6600 1050,4414 " + style="stroke:#00d1d1;stroke-width:30.00455750000000066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline44" /> + <!-- Arrowhead on XXXpoint 1050 6600 - 1050 4290--> + <!-- Line --> + <polyline + points="2250,5400 2250,4414 " + style="stroke:#00d1d1;stroke-width:30.00455750000000066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline48" /> + <!-- Arrowhead on XXXpoint 2250 5400 - 2250 4290--> + <!-- Line --> + <polyline + points="2250,8100 2250,6364 " + 
style="stroke:#00ff00;stroke-width:30;stroke-linejoin:miter;stroke-linecap:butt;marker-end:url(#Arrow1Mend)" + id="polyline52" /> + <!-- Arrowhead on XXXpoint 2250 8100 - 2250 6240--> + <!-- Line --> + <polyline + points="1050,9300 1050,7564 " + style="stroke:#00ff00;stroke-width:30;stroke-linejoin:miter;stroke-linecap:butt;marker-end:url(#Arrow1Mend)" + id="polyline56" /> + <!-- Arrowhead on XXXpoint 1050 9300 - 1050 7440--> + <!-- Line --> + <polyline + points="4050,9300 4050,7564 " + style="stroke:#00ff00;stroke-width:30;stroke-linejoin:miter;stroke-linecap:butt;marker-end:url(#Arrow1Mend)" + id="polyline60" /> + <!-- Arrowhead on XXXpoint 4050 9300 - 4050 7440--> + <!-- Line --> + <polyline + points="5250,8100 5250,6364 " + style="stroke:#00ff00;stroke-width:30;stroke-linejoin:miter;stroke-linecap:butt;marker-end:url(#Arrow1Mend)" + id="polyline64" /> + <!-- Arrowhead on XXXpoint 5250 8100 - 5250 6240--> + <!-- Circle --> + <circle + cx="2850" + cy="3900" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle68" /> + <!-- Circle --> + <circle + cx="3150" + cy="3900" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle70" /> + <!-- Circle --> + <circle + cx="3450" + cy="3900" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle72" /> + <!-- Circle --> + <circle + cx="1350" + cy="5100" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle74" /> + <!-- Circle --> + <circle + cx="1650" + cy="5100" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle76" /> + <!-- Circle --> + <circle + cx="1950" + cy="5100" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle78" /> + <!-- Circle --> + <circle + cx="4350" + cy="5100" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle80" /> + <!-- Circle --> + <circle + cx="4650" + cy="5100" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle82" 
/> + <!-- Circle --> + <circle + cx="4950" + cy="5100" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle84" /> + <!-- Line: box --> + <rect + x="750" + y="3450" + width="1800" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect86" /> + <!-- Line: box --> + <rect + x="300" + y="6600" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect88" /> + <!-- Line: box --> + <rect + x="3750" + y="3450" + width="1800" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect90" /> + <!-- Line: box --> + <rect + x="4500" + y="5400" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect92" /> + <!-- Line: box --> + <rect + x="3300" + y="6600" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect94" /> + <!-- Line: box --> + <rect + x="2250" + y="1650" + width="1800" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect96" /> + <!-- Line: box --> + <rect + x="0" + y="9300" + width="2100" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; " + id="rect98" /> + <!-- Line: box --> + <rect + x="1350" + y="8100" + width="2100" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; " + id="rect100" /> + <!-- Line: box --> + <rect + x="3000" + y="9300" + width="2100" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; " + 
id="rect102" /> + <!-- Line: box --> + <rect + x="4350" + y="8100" + width="2100" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; " + id="rect104" /> + <!-- Line: box --> + <rect + x="1500" + y="5400" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect106" /> + <!-- Text --> + <text + xml:space="preserve" + x="6450" + y="300" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="192" + text-anchor="end" + id="text108">rcu_bh</text> + <!-- Text --> + <text + xml:space="preserve" + x="3150" + y="1950" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text110">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="3150" + y="2250" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text112">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="1650" + y="3750" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text114">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="1650" + y="4050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text116">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="4650" + y="4050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text118">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="4650" + y="3750" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + 
id="text120">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="2250" + y="5700" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text122">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="2250" + y="6000" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text124">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="1050" + y="6900" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text126">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="1050" + y="7200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text128">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="5250" + y="5700" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text130">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="5250" + y="6000" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text132">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="4050" + y="6900" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text134">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="4050" + y="7200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text136">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="450" + y="1350" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + 
text-anchor="start" + id="text138">struct rcu_state</text> + <!-- Text --> + <text + xml:space="preserve" + x="1050" + y="9600" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text140">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="1050" + y="9900" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text142">rcu_dynticks</text> + <!-- Text --> + <text + xml:space="preserve" + x="4050" + y="9600" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text144">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="4050" + y="9900" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text146">rcu_dynticks</text> + <!-- Text --> + <text + xml:space="preserve" + x="2400" + y="8400" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text148">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="2400" + y="8700" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text150">rcu_dynticks</text> + <!-- Text --> + <text + xml:space="preserve" + x="5400" + y="8400" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text152">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="5400" + y="8700" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text154">rcu_dynticks</text> + <!-- Text --> + <text + xml:space="preserve" + x="6000" + y="750" + fill="#000000" + font-family="Helvetica" + font-style="normal" 
+ font-weight="normal" + font-size="192" + text-anchor="end" + id="text156">rcu_sched</text> + <!-- Line --> + <polyline + points="5250,5400 5250,4414 " + style="stroke:#00d1d1;stroke-width:30.00455750000000066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline158" /> + <!-- Arrowhead on XXXpoint 5250 5400 - 5250 4290--> + </g> +</svg> diff --git a/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntick.svg b/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntick.svg new file mode 100644 index 000000000000..15adcac036c7 --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntick.svg @@ -0,0 +1,741 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- Creator: fig2dev Version 3.2 Patchlevel 5e --> + +<!-- CreationDate: Wed Dec 9 17:32:59 2015 --> + +<!-- Magnification: 2.000 --> + +<svg + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns="http://www.w3.org/2000/svg" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + width="6.1in" + height="8.9in" + viewBox="-44 -44 7288 10738" + id="svg2" + version="1.1" + inkscape:version="0.48.4 r9939" + sodipodi:docname="BigTreePreemptRCUBHdyntick.fig"> + <metadata + id="metadata182"> + <rdf:RDF> + <cc:Work + rdf:about=""> + <dc:format>image/svg+xml</dc:format> + <dc:type + rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> + <dc:title></dc:title> + </cc:Work> + </rdf:RDF> + </metadata> + <defs + id="defs180"> + <marker + inkscape:stockid="Arrow1Mend" + orient="auto" + refY="0.0" + refX="0.0" + id="Arrow1Mend" + style="overflow:visible;"> + <path + id="path3940" + d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z " + 
style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;" + transform="scale(0.4) rotate(180) translate(10,0)" /> + </marker> + </defs> + <sodipodi:namedview + pagecolor="#ffffff" + bordercolor="#666666" + borderopacity="1" + objecttolerance="10" + gridtolerance="10" + guidetolerance="10" + inkscape:pageopacity="0" + inkscape:pageshadow="2" + inkscape:window-width="874" + inkscape:window-height="1148" + id="namedview178" + showgrid="false" + inkscape:zoom="1.2097379" + inkscape:cx="274.5" + inkscape:cy="400.49997" + inkscape:window-x="946" + inkscape:window-y="24" + inkscape:window-maximized="0" + inkscape:current-layer="g4" /> + <g + style="stroke-width:.025in; fill:none" + id="g4"> + <!-- Line: box --> + <rect + x="900" + y="0" + width="6300" + height="7350" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; " + id="rect6" /> + <!-- Line: box --> + <rect + x="1200" + y="600" + width="5700" + height="3750" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; " + id="rect8" /> + <!-- Line: box --> + <rect + x="5400" + y="4950" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect10" /> + <!-- Line: box --> + <rect + x="450" + y="450" + width="6300" + height="7350" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; " + id="rect12" /> + <!-- Line: box --> + <rect + x="750" + y="1050" + width="5700" + height="3750" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; " + id="rect14" /> + <!-- Line: box --> + <rect + x="4950" + y="5400" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect16" /> + <!-- Line --> + <polyline + points="5250,8550 5688,6362 " + 
style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="polyline18" /> + <!-- Arrowhead on XXXpoint 5250 8550 - 5710 6240--> + <polyline + points="5714 6518 5704 6272 5598 6494 " + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; " + id="polyline20" /> + <!-- Line --> + <polyline + points="4050,9750 4486,7712 " + style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="polyline22" /> + <!-- Arrowhead on XXXpoint 4050 9750 - 4512 7590--> + <polyline + points="4514 7868 4506 7622 4396 7844 " + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; " + id="polyline24" /> + <!-- Line --> + <polyline + points="1040,9750 1476,7712 " + style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="polyline26" /> + <!-- Arrowhead on XXXpoint 1040 9750 - 1502 7590--> + <polyline + points="1504 7868 1496 7622 1386 7844 " + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; " + id="polyline28" /> + <!-- Line --> + <polyline + points="2240,8550 2676,6512 " + style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="polyline30" /> + <!-- Arrowhead on XXXpoint 2240 8550 - 2702 6390--> + <polyline + points="2704 6668 2696 6422 2586 6644 " + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; " + id="polyline32" /> + <!-- Line --> + <polyline + points="4050,9750 5682,6360 " + style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="polyline34" /> + <!-- Arrowhead on XXXpoint 4050 9750 - 5736 6246--> + <polyline + points="5672 6518 5722 6276 5562 6466 " + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; " + id="polyline36" /> + <!-- Line --> + <polyline + points="1010,9750 2642,6360 " + style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="polyline38" /> + <!-- Arrowhead on XXXpoint 1010 9750 - 2696 6246--> + <polyline + points="2632 6518 2682 
6276 2522 6466 " + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; " + id="polyline40" /> + <!-- Line: box --> + <rect + x="0" + y="900" + width="6300" + height="7350" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; " + id="rect42" /> + <!-- Line: box --> + <rect + x="300" + y="1500" + width="5700" + height="3750" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; " + id="rect44" /> + <!-- Line --> + <polyline + points="1350,3900 2350,3040 " + style="stroke:#00d1d1;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline46" /> + <!-- Arrowhead on XXXpoint 1350 3900 - 2444 2960--> + <!-- Line --> + <polyline + points="4950,3900 3948,3040 " + style="stroke:#00d1d1;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline50" /> + <!-- Arrowhead on XXXpoint 4950 3900 - 3854 2960--> + <!-- Line --> + <polyline + points="4050,7050 4050,4864 " + style="stroke:#00d1d1;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline54" /> + <!-- Arrowhead on XXXpoint 4050 7050 - 4050 4740--> + <!-- Line --> + <polyline + points="1050,7050 1050,4864 " + style="stroke:#00d1d1;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline58" /> + <!-- Arrowhead on XXXpoint 1050 7050 - 1050 4740--> + <!-- Line --> + <polyline + points="2250,5850 2250,4864 " + style="stroke:#00d1d1;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline62" /> + <!-- Arrowhead on XXXpoint 2250 5850 - 2250 4740--> 
+ <!-- Line --> + <polyline + points="2250,8550 2250,6814 " + style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="polyline66" /> + <!-- Arrowhead on XXXpoint 2250 8550 - 2250 6690--> + <!-- Line --> + <polyline + points="1050,9750 1050,8014 " + style="stroke:#00ff00;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline70" /> + <!-- Arrowhead on XXXpoint 1050 9750 - 1050 7890--> + <!-- Line --> + <polyline + points="4050,9750 4050,8014 " + style="stroke:#00ff00;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline74" /> + <!-- Arrowhead on XXXpoint 4050 9750 - 4050 7890--> + <!-- Line --> + <polyline + points="5250,8550 5250,6814 " + style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="polyline78" /> + <!-- Arrowhead on XXXpoint 5250 8550 - 5250 6690--> + <!-- Circle --> + <circle + cx="2850" + cy="4350" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle82" /> + <!-- Circle --> + <circle + cx="3150" + cy="4350" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle84" /> + <!-- Circle --> + <circle + cx="3450" + cy="4350" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle86" /> + <!-- Circle --> + <circle + cx="1350" + cy="5550" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle88" /> + <!-- Circle --> + <circle + cx="1650" + cy="5550" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle90" /> + <!-- Circle --> + <circle + cx="1950" + cy="5550" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle92" /> + <!-- Circle --> + <circle + cx="4350" + cy="5550" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle94" /> + <!-- Circle 
--> + <circle + cx="4650" + cy="5550" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle96" /> + <!-- Circle --> + <circle + cx="4950" + cy="5550" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle98" /> + <!-- Line: box --> + <rect + x="750" + y="3900" + width="1800" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect100" /> + <!-- Line: box --> + <rect + x="300" + y="7050" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect102" /> + <!-- Line: box --> + <rect + x="3750" + y="3900" + width="1800" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect104" /> + <!-- Line: box --> + <rect + x="4500" + y="5850" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect106" /> + <!-- Line: box --> + <rect + x="3300" + y="7050" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect108" /> + <!-- Line: box --> + <rect + x="2250" + y="2100" + width="1800" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect110" /> + <!-- Line: box --> + <rect + x="0" + y="9750" + width="2100" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; " + id="rect112" /> + <!-- Line: box --> + <rect + x="1350" + y="8550" + width="2100" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; " + id="rect114" /> + <!-- Line: box --> + <rect + x="3000" + y="9750" + width="2100" + 
height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; " + id="rect116" /> + <!-- Line: box --> + <rect + x="4350" + y="8550" + width="2100" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; " + id="rect118" /> + <!-- Line: box --> + <rect + x="1500" + y="5850" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect120" /> + <!-- Text --> + <text + xml:space="preserve" + x="6450" + y="750" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="192" + text-anchor="end" + id="text122">rcu_bh</text> + <!-- Text --> + <text + xml:space="preserve" + x="3150" + y="2400" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text124">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="3150" + y="2700" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text126">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="1650" + y="4200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text128">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="1650" + y="4500" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text130">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="4650" + y="4500" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text132">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="4650" + y="4200" + fill="#000000" + 
font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text134">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="2250" + y="6150" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text136">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="2250" + y="6450" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text138">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="1050" + y="7350" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text140">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="1050" + y="7650" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text142">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="5250" + y="6150" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text144">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="5250" + y="6450" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text146">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="4050" + y="7350" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text148">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="4050" + y="7650" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text150">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="450" + y="1800" + 
fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="start" + id="text152">struct rcu_state</text> + <!-- Text --> + <text + xml:space="preserve" + x="1050" + y="10050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text154">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="1050" + y="10350" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text156">rcu_dynticks</text> + <!-- Text --> + <text + xml:space="preserve" + x="4050" + y="10050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text158">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="4050" + y="10350" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text160">rcu_dynticks</text> + <!-- Text --> + <text + xml:space="preserve" + x="2400" + y="8850" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text162">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="2400" + y="9150" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text164">rcu_dynticks</text> + <!-- Text --> + <text + xml:space="preserve" + x="5400" + y="8850" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text166">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="5400" + y="9150" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text168">rcu_dynticks</text> + <!-- Text --> + <text + 
xml:space="preserve" + x="6900" + y="300" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="192" + text-anchor="end" + id="text170">rcu_preempt</text> + <!-- Text --> + <text + xml:space="preserve" + x="6000" + y="1200" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="192" + text-anchor="end" + id="text172">rcu_sched</text> + <!-- Line --> + <polyline + points="5250,5850 5250,4864 " + style="stroke:#00d1d1;stroke-width:30.00205472;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline174" /> + <!-- Arrowhead on XXXpoint 5250 5850 - 5250 4740--> + </g> +</svg> diff --git a/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntickCB.svg b/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntickCB.svg new file mode 100644 index 000000000000..bbc3801470d0 --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntickCB.svg @@ -0,0 +1,858 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- Creator: fig2dev Version 3.2 Patchlevel 5e --> + +<!-- CreationDate: Wed Dec 9 17:29:48 2015 --> + +<!-- Magnification: 2.000 --> + +<svg + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns="http://www.w3.org/2000/svg" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + width="7.4in" + height="9.9in" + viewBox="-44 -44 8938 11938" + id="svg2" + version="1.1" + inkscape:version="0.48.4 r9939" + sodipodi:docname="BigTreePreemptRCUBHdyntickCB.svg"> + <metadata + id="metadata212"> + <rdf:RDF> + <cc:Work + rdf:about=""> + <dc:format>image/svg+xml</dc:format> + <dc:type + rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> + 
<dc:title></dc:title> + </cc:Work> + </rdf:RDF> + </metadata> + <defs + id="defs210"> + <marker + inkscape:stockid="Arrow1Mend" + orient="auto" + refY="0.0" + refX="0.0" + id="Arrow1Mend" + style="overflow:visible;"> + <path + id="path3970" + d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z " + style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;" + transform="scale(0.4) rotate(180) translate(10,0)" /> + </marker> + </defs> + <sodipodi:namedview + pagecolor="#ffffff" + bordercolor="#666666" + borderopacity="1" + objecttolerance="10" + gridtolerance="10" + guidetolerance="10" + inkscape:pageopacity="0" + inkscape:pageshadow="2" + inkscape:window-width="881" + inkscape:window-height="1128" + id="namedview208" + showgrid="false" + inkscape:zoom="1.0195195" + inkscape:cx="333" + inkscape:cy="445.49997" + inkscape:window-x="936" + inkscape:window-y="24" + inkscape:window-maximized="0" + inkscape:current-layer="g4" /> + <g + style="stroke-width:.025in; fill:none" + id="g4"> + <!-- Line: box --> + <rect + x="900" + y="0" + width="6300" + height="7350" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; " + id="rect6" /> + <!-- Line: box --> + <rect + x="1200" + y="600" + width="5700" + height="3750" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; " + id="rect8" /> + <!-- Line: box --> + <rect + x="5400" + y="4950" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect10" /> + <!-- Line: box --> + <rect + x="450" + y="450" + width="6300" + height="7350" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; " + id="rect12" /> + <!-- Line: box --> + <rect + x="750" + y="1050" + width="5700" + height="3750" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; 
fill:#ffff00; " + id="rect14" /> + <!-- Line: box --> + <rect + x="4950" + y="5400" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect16" /> + <!-- Line --> + <polyline + points="5250,8550 5688,6362 " + style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="polyline18" /> + <!-- Arrowhead on XXXpoint 5250 8550 - 5710 6240--> + <polyline + points="5714 6518 5704 6272 5598 6494 " + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; " + id="polyline20" /> + <!-- Line --> + <polyline + points="4050,9750 4486,7712 " + style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="polyline22" /> + <!-- Arrowhead on XXXpoint 4050 9750 - 4512 7590--> + <polyline + points="4514 7868 4506 7622 4396 7844 " + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; " + id="polyline24" /> + <!-- Line --> + <polyline + points="1040,9750 1476,7712 " + style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="polyline26" /> + <!-- Arrowhead on XXXpoint 1040 9750 - 1502 7590--> + <polyline + points="1504 7868 1496 7622 1386 7844 " + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; " + id="polyline28" /> + <!-- Line --> + <polyline + points="2240,8550 2676,6512 " + style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="polyline30" /> + <!-- Arrowhead on XXXpoint 2240 8550 - 2702 6390--> + <polyline + points="2704 6668 2696 6422 2586 6644 " + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; " + id="polyline32" /> + <!-- Line --> + <polyline + points="4050,9600 5692,6062 " + style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="polyline34" /> + <!-- Arrowhead on XXXpoint 4050 9600 - 5744 5948--> + <polyline + points="5682 6220 5730 5978 5574 6170 " + 
style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; " + id="polyline36" /> + <!-- Line --> + <polyline + points="1086,9600 2728,6062 " + style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="polyline38" /> + <!-- Arrowhead on XXXpoint 1086 9600 - 2780 5948--> + <polyline + points="2718 6220 2766 5978 2610 6170 " + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; " + id="polyline40" /> + <!-- Line: box --> + <rect + x="0" + y="900" + width="6300" + height="7350" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; " + id="rect42" /> + <!-- Line: box --> + <rect + x="300" + y="1500" + width="5700" + height="3750" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; " + id="rect44" /> + <!-- Line --> + <polyline + points="1350,3900 2350,3040 " + style="stroke:#00d1d1;stroke-width:29.99463964;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline46" /> + <!-- Arrowhead on XXXpoint 1350 3900 - 2444 2960--> + <!-- Line --> + <polyline + points="4950,3900 3948,3040 " + style="stroke:#00d1d1;stroke-width:29.99463964;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline50" /> + <!-- Arrowhead on XXXpoint 4950 3900 - 3854 2960--> + <!-- Line --> + <polyline + points="4050,7050 4050,4864 " + style="stroke:#00d1d1;stroke-width:29.99463964;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline54" /> + <!-- Arrowhead on XXXpoint 4050 7050 - 4050 4740--> + <!-- Line --> + <polyline + points="1050,7050 1050,4864 " + style="stroke:#00d1d1;stroke-width:29.99463964;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline58" /> + <!-- 
Arrowhead on XXXpoint 1050 7050 - 1050 4740--> + <!-- Line --> + <polyline + points="2250,5850 2250,4864 " + style="stroke:#00d1d1;stroke-width:29.99463964;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline62" /> + <!-- Arrowhead on XXXpoint 2250 5850 - 2250 4740--> + <!-- Line --> + <polyline + points="2250,8550 2250,6814 " + style="stroke:#00ff00;stroke-width:29.99463964;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline66" /> + <!-- Arrowhead on XXXpoint 2250 8550 - 2250 6690--> + <!-- Line --> + <polyline + points="1050,9750 1050,8014 " + style="stroke:#00ff00;stroke-width:29.99463964;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline70" /> + <!-- Arrowhead on XXXpoint 1050 9750 - 1050 7890--> + <!-- Line --> + <polyline + points="4050,9750 4050,8014 " + style="stroke:#00ff00;stroke-width:29.99463964;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline74" /> + <!-- Arrowhead on XXXpoint 4050 9750 - 4050 7890--> + <!-- Line --> + <polyline + points="5250,8550 5250,6814 " + style="stroke:#00ff00;stroke-width:29.99463964;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline78" /> + <!-- Arrowhead on XXXpoint 5250 8550 - 5250 6690--> + <!-- Line --> + <polyline + points="6000,6300 8048,7910 " + style="stroke:#87cfff;stroke-width:30;stroke-linejoin:miter;stroke-linecap:butt;marker-end:url(#Arrow1Mend)" + id="polyline82" /> + <!-- Arrowhead on XXXpoint 6000 6300 - 8146 7986--> + <!-- Circle --> + <circle + cx="2850" + cy="4350" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle86" /> + <!-- Circle --> + <circle + cx="3150" + cy="4350" + r="76" + 
style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle88" /> + <!-- Circle --> + <circle + cx="3450" + cy="4350" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle90" /> + <!-- Circle --> + <circle + cx="1350" + cy="5550" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle92" /> + <!-- Circle --> + <circle + cx="1650" + cy="5550" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle94" /> + <!-- Circle --> + <circle + cx="1950" + cy="5550" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle96" /> + <!-- Circle --> + <circle + cx="4350" + cy="5550" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle98" /> + <!-- Circle --> + <circle + cx="4650" + cy="5550" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle100" /> + <!-- Circle --> + <circle + cx="4950" + cy="5550" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle102" /> + <!-- Line: box --> + <rect + x="7350" + y="7950" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect104" /> + <!-- Line: box --> + <rect + x="7350" + y="9450" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect106" /> + <!-- Line --> + <polyline + points="8100,8850 8100,9384 " + style="stroke:#000000;stroke-width:30;stroke-linejoin:miter;stroke-linecap:butt;marker-end:url(#Arrow1Mend)" + id="polyline108" /> + <!-- Arrowhead on XXXpoint 8100 8850 - 8100 9510--> + <!-- Line: box --> + <rect + x="7350" + y="10950" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect112" /> + <!-- Line --> + <polyline + points="8100,10350 8100,10884 " + 
style="stroke:#000000;stroke-width:30;stroke-linejoin:miter;stroke-linecap:butt;marker-end:url(#Arrow1Mend)" + id="polyline114" /> + <!-- Arrowhead on XXXpoint 8100 10350 - 8100 11010--> + <!-- Line: box --> + <rect + x="750" + y="3900" + width="1800" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect118" /> + <!-- Line: box --> + <rect + x="300" + y="7050" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect120" /> + <!-- Line: box --> + <rect + x="3750" + y="3900" + width="1800" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect122" /> + <!-- Line: box --> + <rect + x="4500" + y="5850" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect124" /> + <!-- Line: box --> + <rect + x="3300" + y="7050" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect126" /> + <!-- Line: box --> + <rect + x="2250" + y="2100" + width="1800" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect128" /> + <!-- Line: box --> + <rect + x="0" + y="9750" + width="2100" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; " + id="rect130" /> + <!-- Line: box --> + <rect + x="1350" + y="8550" + width="2100" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; " + id="rect132" /> + <!-- Line: box --> + <rect + x="3000" + y="9750" + width="2100" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; 
stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; " + id="rect134" /> + <!-- Line: box --> + <rect + x="4350" + y="8550" + width="2100" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; " + id="rect136" /> + <!-- Line: box --> + <rect + x="1500" + y="5850" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect138" /> + <!-- Text --> + <text + xml:space="preserve" + x="8100" + y="8250" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text140">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="8100" + y="8550" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text142">rcu_head</text> + <!-- Text --> + <text + xml:space="preserve" + x="8100" + y="9750" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text144">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="8100" + y="10050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text146">rcu_head</text> + <!-- Text --> + <text + xml:space="preserve" + x="8100" + y="11250" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text148">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="8100" + y="11550" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text150">rcu_head</text> + <!-- Text --> + <text + xml:space="preserve" + x="6000" + y="1200" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" 
+ font-size="192" + text-anchor="end" + id="text152">rcu_sched</text> + <!-- Text --> + <text + xml:space="preserve" + x="6450" + y="750" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="192" + text-anchor="end" + id="text154">rcu_bh</text> + <!-- Text --> + <text + xml:space="preserve" + x="3150" + y="2400" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text156">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="3150" + y="2700" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text158">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="1650" + y="4200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text160">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="1650" + y="4500" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text162">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="4650" + y="4500" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text164">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="4650" + y="4200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text166">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="2250" + y="6150" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text168">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="2250" + y="6450" + fill="#000000" + font-family="Courier" + font-style="normal" + 
font-weight="bold" + font-size="192" + text-anchor="middle" + id="text170">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="1050" + y="7350" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text172">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="1050" + y="7650" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text174">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="5250" + y="6150" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text176">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="5250" + y="6450" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text178">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="4050" + y="7350" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text180">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="4050" + y="7650" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text182">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="450" + y="1800" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="start" + id="text184">struct rcu_state</text> + <!-- Text --> + <text + xml:space="preserve" + x="1050" + y="10050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text186">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="1050" + y="10350" + fill="#000000" + 
font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text188">rcu_dynticks</text> + <!-- Text --> + <text + xml:space="preserve" + x="4050" + y="10050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text190">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="4050" + y="10350" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text192">rcu_dynticks</text> + <!-- Text --> + <text + xml:space="preserve" + x="2400" + y="8850" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text194">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="2400" + y="9150" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text196">rcu_dynticks</text> + <!-- Text --> + <text + xml:space="preserve" + x="5400" + y="8850" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text198">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="5400" + y="9150" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text200">rcu_dynticks</text> + <!-- Text --> + <text + xml:space="preserve" + x="6900" + y="300" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="192" + text-anchor="end" + id="text202">rcu_preempt</text> + <!-- Line --> + <polyline + points="5250,5850 5250,4864 " + style="stroke:#00d1d1;stroke-width:29.99463964;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline204" /> + <!-- Arrowhead on XXXpoint 
5250 5850 - 5250 4740--> + </g> +</svg> diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.html b/Documentation/RCU/Design/Data-Structures/Data-Structures.html new file mode 100644 index 000000000000..7eb47ac25ad7 --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.html @@ -0,0 +1,1333 @@ +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" + "http://www.w3.org/TR/html4/loose.dtd"> + <html> + <head><title>A Tour Through TREE_RCU's Data Structures [LWN.net]</title> + <meta HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=iso-8859-1"> + + <p>January 27, 2016</p> + <p>This article was contributed by Paul E. McKenney</p> + +<h3>Introduction</h3> + +This document describes RCU's major data structures and their relationship +to each other. + +<ol> +<li> <a href="#Data-Structure Relationships"> + Data-Structure Relationships</a> +<li> <a href="#The rcu_state Structure"> + The <tt>rcu_state</tt> Structure</a> +<li> <a href="#The rcu_node Structure"> + The <tt>rcu_node</tt> Structure</a> +<li> <a href="#The rcu_data Structure"> + The <tt>rcu_data</tt> Structure</a> +<li> <a href="#The rcu_dynticks Structure"> + The <tt>rcu_dynticks</tt> Structure</a> +<li> <a href="#The rcu_head Structure"> + The <tt>rcu_head</tt> Structure</a> +<li> <a href="#RCU-Specific Fields in the task_struct Structure"> + RCU-Specific Fields in the <tt>task_struct</tt> Structure</a> +<li> <a href="#Accessor Functions"> + Accessor Functions</a> +</ol> + +At the end we have the +<a href="#Answers to Quick Quizzes">answers to the quick quizzes</a>. + +<h3><a name="Data-Structure Relationships">Data-Structure Relationships</a></h3> + +<p>RCU is for all intents and purposes a large state machine, and its +data structures maintain the state in such a way as to allow RCU readers +to execute extremely quickly, while also processing the RCU grace periods +requested by updaters in an efficient and extremely scalable fashion. 
+The efficiency and scalability of RCU updaters are provided primarily +by a combining tree, as shown below: + +</p><p><img src="BigTreeClassicRCU.svg" alt="BigTreeClassicRCU.svg" width="30%"> + +</p><p>This diagram shows an enclosing <tt>rcu_state</tt> structure +containing a tree of <tt>rcu_node</tt> structures. +Each leaf node of the <tt>rcu_node</tt> tree has up to 16 +<tt>rcu_data</tt> structures associated with it, so that there +are <tt>NR_CPUS</tt> number of <tt>rcu_data</tt> structures, +one for each possible CPU. +This structure is adjusted at boot time, if needed, to handle the +common case where <tt>nr_cpu_ids</tt> is much less than +<tt>NR_CPUS</tt>. +For example, a number of Linux distributions set <tt>NR_CPUS=4096</tt>, +which results in a three-level <tt>rcu_node</tt> tree. +If the actual hardware has only 16 CPUs, RCU will adjust itself +at boot time, resulting in an <tt>rcu_node</tt> tree with only a single node. + +</p><p>The purpose of this combining tree is to allow per-CPU events +such as quiescent states, dyntick-idle transitions, +and CPU hotplug operations to be processed efficiently +and scalably. +Quiescent states are recorded by the per-CPU <tt>rcu_data</tt> structures, +and other events are recorded by the leaf-level <tt>rcu_node</tt> +structures. +All of these events are combined at each level of the tree until finally +grace periods are completed at the tree's root <tt>rcu_node</tt> +structure. +A grace period can be completed at the root once every CPU +(or, in the case of <tt>CONFIG_PREEMPT_RCU</tt>, task) +has passed through a quiescent state. +Once a grace period has completed, record of that fact is propagated +back down the tree. + +</p><p>As can be seen from the diagram, on a 64-bit system +a two-level tree with 64 leaves can accommodate 1,024 CPUs, with a fanout +of 64 at the root and a fanout of 16 at the leaves.
+ +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + Why isn't the fanout at the leaves also 64? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + Because there are more types of events that affect the leaf-level + <tt>rcu_node</tt> structures than further up the tree. + Therefore, if the leaf <tt>rcu_node</tt> structures have fanout of + 64, the contention on these structures' <tt>->lock</tt> + becomes excessive. + Experimentation on a wide variety of systems has shown that a fanout + of 16 works well for the leaves of the <tt>rcu_node</tt> tree. + </font> + + <p><font color="ffffff">Of course, further experience with + systems having hundreds or thousands of CPUs may demonstrate + that the fanout for the non-leaf <tt>rcu_node</tt> structures + must also be reduced. + Such reduction can be easily carried out when and if it proves + necessary. + In the meantime, if you are using such a system and running into + contention problems on the non-leaf <tt>rcu_node</tt> structures, + you may use the <tt>CONFIG_RCU_FANOUT</tt> kernel configuration + parameter to reduce the non-leaf fanout as needed. + </font> + + <p><font color="ffffff">Kernels built for systems with + strong NUMA characteristics might also need to adjust + <tt>CONFIG_RCU_FANOUT</tt> so that the domains of the + <tt>rcu_node</tt> structures align with hardware boundaries. + However, there has thus far been no need for this. +</font></td></tr> +<tr><td> </td></tr> +</table> + +<p>If your system has more than 1,024 CPUs (or more than 512 CPUs on +a 32-bit system), then RCU will automatically add more levels to the +tree.
+For example, if you are crazy enough to build a 64-bit system with 65,536 +CPUs, RCU would configure the <tt>rcu_node</tt> tree as follows: + +</p><p><img src="HugeTreeClassicRCU.svg" alt="HugeTreeClassicRCU.svg" width="50%"> + +</p><p>RCU currently permits up to a four-level tree, which on a 64-bit system +accommodates up to 4,194,304 CPUs, though only a mere 524,288 CPUs for +32-bit systems. +On the other hand, you can set <tt>CONFIG_RCU_FANOUT</tt> to be +as small as 2 if you wish, which would permit only 16 CPUs, which +is useful for testing. + +</p><p>This multi-level combining tree allows us to get most of the +performance and scalability +benefits of partitioning, even though RCU grace-period detection is +inherently a global operation. +The trick here is that only the last CPU to report a quiescent state +into a given <tt>rcu_node</tt> structure need advance to the <tt>rcu_node</tt> +structure at the next level up the tree. +This means that at the leaf-level <tt>rcu_node</tt> structure, only +one access out of sixteen will progress up the tree. +For the internal <tt>rcu_node</tt> structures, the situation is even +more extreme: Only one access out of sixty-four will progress up +the tree. +Because the vast majority of the CPUs do not progress up the tree, +the lock contention remains roughly constant up the tree. +No matter how many CPUs there are in the system, at most 64 quiescent-state +reports per grace period will progress all the way to the root +<tt>rcu_node</tt> structure, thus ensuring that the lock contention +on that root <tt>rcu_node</tt> structure remains acceptably low. + +</p><p>In effect, the combining tree acts like a big shock absorber, +keeping lock contention under control at all tree levels regardless +of the level of loading on the system. + +</p><p>The Linux kernel actually supports multiple flavors of RCU +running concurrently, so RCU builds separate data structures for each +flavor. 
+For example, for <tt>CONFIG_TREE_RCU=y</tt> kernels, RCU provides +rcu_sched and rcu_bh, as shown below: + +</p><p><img src="BigTreeClassicRCUBH.svg" alt="BigTreeClassicRCUBH.svg" width="33%"> + +</p><p>Energy efficiency is increasingly important, and for that +reason the Linux kernel provides <tt>CONFIG_NO_HZ_IDLE</tt>, which +turns off the scheduling-clock interrupts on idle CPUs, which in +turn allows those CPUs to attain deeper sleep states and to consume +less energy. +CPUs whose scheduling-clock interrupts have been turned off are +said to be in <i>dyntick-idle mode</i>. +RCU must handle dyntick-idle CPUs specially +because RCU would otherwise wake up each CPU on every grace period, +which would defeat the whole purpose of <tt>CONFIG_NO_HZ_IDLE</tt>. +RCU uses the <tt>rcu_dynticks</tt> structure to track +which CPUs are in dyntick-idle mode, as shown below: + +</p><p><img src="BigTreeClassicRCUBHdyntick.svg" alt="BigTreeClassicRCUBHdyntick.svg" width="33%"> + +</p><p>However, if a CPU is in dyntick-idle mode, it is in that mode +for all flavors of RCU. +Therefore, a single <tt>rcu_dynticks</tt> structure is allocated per +CPU, and all of a given CPU's <tt>rcu_data</tt> structures share +that <tt>rcu_dynticks</tt>, as shown in the figure. + +</p><p>Kernels built with <tt>CONFIG_PREEMPT_RCU</tt> support +rcu_preempt in addition to rcu_sched and rcu_bh, as shown below: + +</p><p><img src="BigTreePreemptRCUBHdyntick.svg" alt="BigTreePreemptRCUBHdyntick.svg" width="35%"> + +</p><p>RCU updaters wait for normal grace periods by registering +RCU callbacks, either directly via <tt>call_rcu()</tt> and +friends (namely <tt>call_rcu_bh()</tt> and <tt>call_rcu_sched()</tt>, +there being a separate interface per flavor of RCU) +or indirectly via <tt>synchronize_rcu()</tt> and friends.
+RCU callbacks are represented by <tt>rcu_head</tt> structures, +which are queued on <tt>rcu_data</tt> structures while they are +waiting for a grace period to elapse, as shown in the following figure: + +</p><p><img src="BigTreePreemptRCUBHdyntickCB.svg" alt="BigTreePreemptRCUBHdyntickCB.svg" width="40%"> + +</p><p>This figure shows how <tt>TREE_RCU</tt>'s and +<tt>PREEMPT_RCU</tt>'s major data structures are related. +Lesser data structures will be introduced with the algorithms that +make use of them. + +</p><p>Note that each of the data structures in the above figure has +its own synchronization: + +<p><ol> +<li> Each <tt>rcu_state</tt> structure has a lock and a mutex, + and some fields are protected by the corresponding root + <tt>rcu_node</tt> structure's lock. +<li> Each <tt>rcu_node</tt> structure has a spinlock. +<li> The fields in <tt>rcu_data</tt> are private to the corresponding + CPU, although a few can be read and written by other CPUs. +<li> Similarly, the fields in <tt>rcu_dynticks</tt> are private + to the corresponding CPU, although a few can be read by + other CPUs. +</ol> + +<p>It is important to note that different data structures can have +very different ideas about the state of RCU at any given time. +For but one example, awareness of the start or end of a given RCU +grace period propagates slowly through the data structures. +This slow propagation is absolutely necessary for RCU to have good +read-side performance. +If this balkanized implementation seems foreign to you, one useful +trick is to consider each instance of these data structures to be +a different person, each having the usual slightly different +view of reality.
+ +</p><p>The general role of each of these data structures is as +follows: + +</p><ol> +<li> <tt>rcu_state</tt>: + This structure forms the interconnection between the + <tt>rcu_node</tt> and <tt>rcu_data</tt> structures, + tracks grace periods, serves as a short-term repository + for callbacks orphaned by CPU-hotplug events, + maintains <tt>rcu_barrier()</tt> state, + tracks expedited grace-period state, + and maintains state used to force quiescent states when + grace periods extend too long. +<li> <tt>rcu_node</tt>: This structure forms the combining + tree that propagates quiescent-state + information from the leaves to the root, and also propagates + grace-period information from the root to the leaves. + It provides local copies of the grace-period state in order + to allow this information to be accessed in a synchronized + manner without suffering the scalability limitations that + would otherwise be imposed by global locking. + In <tt>CONFIG_PREEMPT_RCU</tt> kernels, it manages the lists + of tasks that have blocked while in their current + RCU read-side critical section. + In <tt>CONFIG_PREEMPT_RCU</tt> with + <tt>CONFIG_RCU_BOOST</tt>, it manages the + per-<tt>rcu_node</tt> priority-boosting + kernel threads (kthreads) and state. + Finally, it records CPU-hotplug state in order to determine + which CPUs should be ignored during a given grace period. +<li> <tt>rcu_data</tt>: This per-CPU structure is the + focus of quiescent-state detection and RCU callback queuing. + It also tracks its relationship to the corresponding leaf + <tt>rcu_node</tt> structure to allow more-efficient + propagation of quiescent states up the <tt>rcu_node</tt> + combining tree. + Like the <tt>rcu_node</tt> structure, it provides a local + copy of the grace-period information to allow for-free + synchronized + access to this information from the corresponding CPU. + Finally, this structure records past dyntick-idle state + for the corresponding CPU and also tracks statistics.
+<li> <tt>rcu_dynticks</tt>: + This per-CPU structure tracks the current dyntick-idle + state for the corresponding CPU. + Unlike the other three structures, the <tt>rcu_dynticks</tt> + structure is not replicated per RCU flavor. +<li> <tt>rcu_head</tt>: + This structure represents RCU callbacks, and is the + only structure allocated and managed by RCU users. + The <tt>rcu_head</tt> structure is normally embedded + within the RCU-protected data structure. +</ol> + +<p>If all you wanted from this article was a general notion of how +RCU's data structures are related, you are done. +Otherwise, each of the following sections gives more details on +the <tt>rcu_state</tt>, <tt>rcu_node</tt>, <tt>rcu_data</tt>, +and <tt>rcu_dynticks</tt> data structures. + +<h3><a name="The rcu_state Structure"> +The <tt>rcu_state</tt> Structure</a></h3> + +<p>The <tt>rcu_state</tt> structure is the base structure that +represents a flavor of RCU. +This structure forms the interconnection between the +<tt>rcu_node</tt> and <tt>rcu_data</tt> structures, +tracks grace periods, contains the lock used to +synchronize with CPU-hotplug events, +and maintains state used to force quiescent states when +grace periods extend too long. + +</p><p>A few of the <tt>rcu_state</tt> structure's fields are discussed, +singly and in groups, in the following sections. +The more specialized fields are covered in the discussion of their +use. + +<h5>Relationship to rcu_node and rcu_data Structures</h5> + +This portion of the <tt>rcu_state</tt> structure is declared +as follows: + +<pre> + 1 struct rcu_node node[NUM_RCU_NODES]; + 2 struct rcu_node *level[NUM_RCU_LVLS + 1]; + 3 struct rcu_data __percpu *rda; +</pre> + +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + Wait a minute! + You said that the <tt>rcu_node</tt> structures formed a tree, + but they are declared as a flat array! + What gives?
+</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + The tree is laid out in the array. + The first node in the array is the head, the next set of nodes in the + array are children of the head node, and so on until the last set of + nodes in the array are the leaves. + </font> + + <p><font color="ffffff">See the following diagrams to see how + this works. +</font></td></tr> +<tr><td> </td></tr> +</table> + +<p>The <tt>rcu_node</tt> tree is embedded into the +<tt>->node[]</tt> array as shown in the following figure: + +</p><p><img src="TreeMapping.svg" alt="TreeMapping.svg" width="40%"> + +</p><p>One interesting consequence of this mapping is that a +breadth-first traversal of the tree is implemented as a simple +linear scan of the array, which is in fact what the +<tt>rcu_for_each_node_breadth_first()</tt> macro does. +This macro is used at the beginnings and ends of grace periods. + +</p><p>Each entry of the <tt>->level</tt> array references +the first <tt>rcu_node</tt> structure on the corresponding level +of the tree, for example, as shown below: + +</p><p><img src="TreeMappingLevel.svg" alt="TreeMappingLevel.svg" width="40%"> + +</p><p>The zero<sup>th</sup> element of the array references the root +<tt>rcu_node</tt> structure, the first element references the +first child of the root <tt>rcu_node</tt>, and finally the second +element references the first leaf <tt>rcu_node</tt> structure. + +</p><p>For whatever it is worth, if you draw the tree to be tree-shaped +rather than array-shaped, it is easy to draw a planar representation: + +</p><p><img src="TreeLevel.svg" alt="TreeLevel.svg" width="60%"> + +</p><p>Finally, the <tt>->rda</tt> field references a per-CPU +pointer to the corresponding CPU's <tt>rcu_data</tt> structure. + +</p><p>All of these fields are constant once initialization is complete, +and therefore need no protection.
+ +<h5>Grace-Period Tracking</h5> + +<p>This portion of the <tt>rcu_state</tt> structure is declared +as follows: + +<pre> + 1 unsigned long gpnum; + 2 unsigned long completed; +</pre> + +<p>RCU grace periods are numbered, and +the <tt>->gpnum</tt> field contains the number of the grace +period that started most recently. +The <tt>->completed</tt> field contains the number of the +grace period that completed most recently. +If the two fields are equal, the RCU grace period that most recently +started has already completed, and therefore the corresponding +flavor of RCU is idle. +If <tt>->gpnum</tt> is one greater than <tt>->completed</tt>, +then <tt>->gpnum</tt> gives the number of the current RCU +grace period, which has not yet completed. +Any other combination of values indicates that something is broken. +These two fields are protected by the root <tt>rcu_node</tt>'s +<tt>->lock</tt> field. + +</p><p>There are <tt>->gpnum</tt> and <tt>->completed</tt> fields +in the <tt>rcu_node</tt> and <tt>rcu_data</tt> structures +as well. +The fields in the <tt>rcu_state</tt> structure represent the +most current values, and those of the other structures are compared +in order to detect the start of a new grace period in a distributed +fashion. +The values flow from <tt>rcu_state</tt> to <tt>rcu_node</tt> +(down the tree from the root to the leaves) to <tt>rcu_data</tt>. + +<h5>Miscellaneous</h5> + +<p>This portion of the <tt>rcu_state</tt> structure is declared +as follows: + +<pre> + 1 unsigned long gp_max; + 2 char abbr; + 3 char *name; +</pre> + +<p>The <tt>->gp_max</tt> field tracks the duration of the longest +grace period in jiffies. +It is protected by the root <tt>rcu_node</tt>'s <tt>->lock</tt>. + +<p>The <tt>->name</tt> field points to the name of the RCU flavor +(for example, “rcu_sched”), and is constant. +The <tt>->abbr</tt> field contains a one-character abbreviation, +for example, “s” for RCU-sched. 
+ +<h3><a name="The rcu_node Structure"> +The <tt>rcu_node</tt> Structure</a></h3> + +<p>The <tt>rcu_node</tt> structures form the combining +tree that propagates quiescent-state +information from the leaves to the root and also that propagates +grace-period information from the root down to the leaves. +They provide local copies of the grace-period state in order +to allow this information to be accessed in a synchronized +manner without suffering the scalability limitations that +would otherwise be imposed by global locking. +In <tt>CONFIG_PREEMPT_RCU</tt> kernels, they manage the lists +of tasks that have blocked while in their current +RCU read-side critical section. +In <tt>CONFIG_PREEMPT_RCU</tt> with +<tt>CONFIG_RCU_BOOST</tt>, they manage the +per-<tt>rcu_node</tt> priority-boosting +kernel threads (kthreads) and state. +Finally, they record CPU-hotplug state in order to determine +which CPUs should be ignored during a given grace period. + +</p><p>The <tt>rcu_node</tt> structure's fields are discussed, +singly and in groups, in the following sections. + +<h5>Connection to Combining Tree</h5> + +<p>This portion of the <tt>rcu_node</tt> structure is declared +as follows: + +<pre> + 1 struct rcu_node *parent; + 2 u8 level; + 3 u8 grpnum; + 4 unsigned long grpmask; + 5 int grplo; + 6 int grphi; +</pre> + +<p>The <tt>->parent</tt> pointer references the <tt>rcu_node</tt> +one level up in the tree, and is <tt>NULL</tt> for the root +<tt>rcu_node</tt>. +The RCU implementation makes heavy use of this field to push quiescent +states up the tree. +The <tt>->level</tt> field gives the level in the tree, with +the root being at level zero, its children at level one, and so on. +The <tt>->grpnum</tt> field gives this node's position within +the children of its parent, so this number can range between 0 and 31 +on 32-bit systems and between 0 and 63 on 64-bit systems.
+The <tt>->level</tt> and <tt>->grpnum</tt> fields are +used only during initialization and for tracing. +The <tt>->grpmask</tt> field is the bitmask counterpart of +<tt>->grpnum</tt>, and therefore always has exactly one bit set. +This mask is used to clear the bit corresponding to this <tt>rcu_node</tt> +structure in its parent's bitmasks, which are described later. +Finally, the <tt>->grplo</tt> and <tt>->grphi</tt> fields +contain the lowest and highest numbered CPU served by this +<tt>rcu_node</tt> structure, respectively. + +</p><p>All of these fields are constant, and thus do not require any +synchronization. + +<h5>Synchronization</h5> + +<p>This field of the <tt>rcu_node</tt> structure is declared +as follows: + +<pre> + 1 raw_spinlock_t lock; +</pre> + +<p>This field is used to protect the remaining fields in this structure, +unless otherwise stated. +That said, all of the fields in this structure can be accessed without +locking for tracing purposes. +Yes, this can result in confusing traces, but better some tracing confusion +than to be heisenbugged out of existence. + +<h5>Grace-Period Tracking</h5> + +<p>This portion of the <tt>rcu_node</tt> structure is declared +as follows: + +<pre> + 1 unsigned long gpnum; + 2 unsigned long completed; +</pre> + +<p>These fields are the counterparts of the fields of the same name in +the <tt>rcu_state</tt> structure. +They each may lag up to one behind their <tt>rcu_state</tt> +counterparts. +If a given <tt>rcu_node</tt> structure's <tt>->gpnum</tt> and +<tt>->completed</tt> fields are equal, then this <tt>rcu_node</tt> +structure believes that RCU is idle. +Otherwise, as with the <tt>rcu_state</tt> structure, +the <tt>->gpnum</tt> field will be one greater than the +<tt>->completed</tt> field, with <tt>->gpnum</tt> +indicating which grace period this <tt>rcu_node</tt> believes +is still being waited for. 
+ +</p><p>The <tt>->gpnum</tt> field of each <tt>rcu_node</tt> +structure is updated at the beginning +of each grace period, and the <tt>->completed</tt> fields are +updated at the end of each grace period. + +<h5>Quiescent-State Tracking</h5> + +<p>These fields manage the propagation of quiescent states up the +combining tree. + +</p><p>This portion of the <tt>rcu_node</tt> structure has fields +as follows: + +<pre> + 1 unsigned long qsmask; + 2 unsigned long expmask; + 3 unsigned long qsmaskinit; + 4 unsigned long expmaskinit; +</pre> + +<p>The <tt>->qsmask</tt> field tracks which of this +<tt>rcu_node</tt> structure's children still need to report +quiescent states for the current normal grace period. +Such children will have a value of 1 in their corresponding bit. +Note that the leaf <tt>rcu_node</tt> structures should be +thought of as having <tt>rcu_data</tt> structures as their +children. +Similarly, the <tt>->expmask</tt> field tracks which +of this <tt>rcu_node</tt> structure's children still need to report +quiescent states for the current expedited grace period. +An expedited grace period has +the same conceptual properties as a normal grace period, but the +expedited implementation accepts extreme CPU overhead to obtain +much lower grace-period latency, for example, consuming a few +tens of microseconds worth of CPU time to reduce grace-period +duration from milliseconds to tens of microseconds. +The <tt>->qsmaskinit</tt> field tracks which of this +<tt>rcu_node</tt> structure's children cover for at least +one online CPU. +This mask is used to initialize <tt>->qsmask</tt>, +and <tt>->expmaskinit</tt> is used to initialize +<tt>->expmask</tt> at the beginning of the +normal and expedited grace periods, respectively. + +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + Why are these bitmasks protected by locking? + Come on, haven't you heard of atomic instructions??? 
+</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + Lockless grace-period computation! Such a tantalizing possibility! + </font> + + <p><font color="ffffff">But consider the following sequence of events: + </font> + + <ol> + <li> <font color="ffffff">CPU 0 has been in dyntick-idle + mode for quite some time. + When it wakes up, it notices that the current RCU + grace period needs it to report in, so it sets a + flag where the scheduling clock interrupt will find it. + </font><p> + <li> <font color="ffffff">Meanwhile, CPU 1 is running + <tt>force_quiescent_state()</tt>, + and notices that CPU 0 has been in dyntick idle mode, + which qualifies as an extended quiescent state. + </font><p> + <li> <font color="ffffff">CPU 0's scheduling clock + interrupt fires in the + middle of an RCU read-side critical section, and notices + that the RCU core needs something, so commences RCU softirq + processing. + </font> + <p> + <li> <font color="ffffff">CPU 0's softirq handler + executes and is just about ready + to report its quiescent state up the <tt>rcu_node</tt> + tree. + </font><p> + <li> <font color="ffffff">But CPU 1 beats it to the punch, + completing the current + grace period and starting a new one. + </font><p> + <li> <font color="ffffff">CPU 0 now reports its quiescent + state for the wrong + grace period. + That grace period might now end before the RCU read-side + critical section. + If that happens, disaster will ensue. + </font> + </ol> + + <p><font color="ffffff">So the locking is absolutely required in + order to coordinate + clearing of the bits with the grace-period numbers in + <tt>->gpnum</tt> and <tt>->completed</tt>. +</font></td></tr> +<tr><td> </td></tr> +</table> + +<h5>Blocked-Task Management</h5> + +<p><tt>PREEMPT_RCU</tt> allows tasks to be preempted in the +midst of their RCU read-side critical sections, and these tasks +must be tracked explicitly. 
+The details of exactly why and how they are tracked will be covered +in a separate article on RCU read-side processing. +For now, it is enough to know that the <tt>rcu_node</tt> +structure tracks them. + +<pre> + 1 struct list_head blkd_tasks; + 2 struct list_head *gp_tasks; + 3 struct list_head *exp_tasks; + 4 bool wait_blkd_tasks; +</pre> + +<p>The <tt>->blkd_tasks</tt> field is a list header for +the list of blocked and preempted tasks. +As tasks undergo context switches within RCU read-side critical +sections, their <tt>task_struct</tt> structures are enqueued +(via the <tt>task_struct</tt>'s <tt>->rcu_node_entry</tt> +field) onto the head of the <tt>->blkd_tasks</tt> list for the +leaf <tt>rcu_node</tt> structure corresponding to the CPU +on which the outgoing context switch executed. +As these tasks later exit their RCU read-side critical sections, +they remove themselves from the list. +This list is therefore in reverse time order, so that if one of the tasks +is blocking the current grace period, all subsequent tasks must +also be blocking that same grace period. +Therefore, a single pointer into this list suffices to track +all tasks blocking a given grace period. +That pointer is stored in <tt>->gp_tasks</tt> for normal +grace periods and in <tt>->exp_tasks</tt> for expedited +grace periods. +These last two fields are <tt>NULL</tt> if either there is +no grace period in flight or if there are no blocked tasks +preventing that grace period from completing. +If either of these two pointers is referencing a task that +removes itself from the <tt>->blkd_tasks</tt> list, +then that task must advance the pointer to the next task on +the list, or set the pointer to <tt>NULL</tt> if there +are no subsequent tasks on the list. + +</p><p>For example, suppose that tasks T1, T2, and T3 are +all hard-affinitied to the largest-numbered CPU in the system. 
+Then if task T1 blocked in an RCU read-side +critical section, then an expedited grace period started, +then task T2 blocked in an RCU read-side critical section, +then a normal grace period started, and finally task T3 blocked +in an RCU read-side critical section, then the state of the +last leaf <tt>rcu_node</tt> structure's blocked-task list +would be as shown below: + +</p><p><img src="blkd_task.svg" alt="blkd_task.svg" width="60%"> + +</p><p>Task T1 is blocking both grace periods, task T2 is +blocking only the normal grace period, and task T3 is blocking +neither grace period. +Note that these tasks will not remove themselves from this list +immediately upon resuming execution. +They will instead remain on the list until they execute the outermost +<tt>rcu_read_unlock()</tt> that ends their RCU read-side critical +section. + +<p> +The <tt>->wait_blkd_tasks</tt> field indicates whether or not +the current grace period is waiting on a blocked task. + +<h5>Sizing the <tt>rcu_node</tt> Array</h5> + +<p>The <tt>rcu_node</tt> array is sized via a series of +C-preprocessor expressions as follows: + +<pre> + 1 #ifdef CONFIG_RCU_FANOUT + 2 #define RCU_FANOUT CONFIG_RCU_FANOUT + 3 #else + 4 # ifdef CONFIG_64BIT + 5 # define RCU_FANOUT 64 + 6 # else + 7 # define RCU_FANOUT 32 + 8 # endif + 9 #endif +10 +11 #ifdef CONFIG_RCU_FANOUT_LEAF +12 #define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF +13 #else +14 # ifdef CONFIG_64BIT +15 # define RCU_FANOUT_LEAF 64 +16 # else +17 # define RCU_FANOUT_LEAF 32 +18 # endif +19 #endif +20 +21 #define RCU_FANOUT_1 (RCU_FANOUT_LEAF) +22 #define RCU_FANOUT_2 (RCU_FANOUT_1 * RCU_FANOUT) +23 #define RCU_FANOUT_3 (RCU_FANOUT_2 * RCU_FANOUT) +24 #define RCU_FANOUT_4 (RCU_FANOUT_3 * RCU_FANOUT) +25 +26 #if NR_CPUS <= RCU_FANOUT_1 +27 # define RCU_NUM_LVLS 1 +28 # define NUM_RCU_LVL_0 1 +29 # define NUM_RCU_NODES NUM_RCU_LVL_0 +30 # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } +31 # define RCU_NODE_NAME_INIT { "rcu_node_0" } +32 # define 
RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } +33 # define RCU_EXP_NAME_INIT { "rcu_node_exp_0" } +34 #elif NR_CPUS <= RCU_FANOUT_2 +35 # define RCU_NUM_LVLS 2 +36 # define NUM_RCU_LVL_0 1 +37 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +38 # define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1) +39 # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } +40 # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } +41 # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } +42 # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" } +43 #elif NR_CPUS <= RCU_FANOUT_3 +44 # define RCU_NUM_LVLS 3 +45 # define NUM_RCU_LVL_0 1 +46 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) +47 # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +48 # define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2) +49 # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } +50 # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } +51 # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } +52 # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" } +53 #elif NR_CPUS <= RCU_FANOUT_4 +54 # define RCU_NUM_LVLS 4 +55 # define NUM_RCU_LVL_0 1 +56 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) +57 # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) +58 # define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +59 # define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) +60 # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } +61 # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } +62 # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } +63 # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" } +64 #else +65 # error 
"CONFIG_RCU_FANOUT insufficient for NR_CPUS" +66 #endif +</pre> + +<p>The maximum number of levels in the <tt>rcu_node</tt> structure +is currently limited to four, as specified by lines 21-24 +and the structure of the subsequent “if” statement. +For 32-bit systems, this allows 16*32*32*32=524,288 CPUs, which +should be sufficient for the next few years at least. +For 64-bit systems, 16*64*64*64=4,194,304 CPUs is allowed, which +should see us through the next decade or so. +This four-level tree also allows kernels built with +<tt>CONFIG_RCU_FANOUT=8</tt> to support up to 4096 CPUs, +which might be useful in very large systems having eight CPUs per +socket (but please note that no one has yet shown any measurable +performance degradation due to misaligned socket and <tt>rcu_node</tt> +boundaries). +In addition, building kernels with a full four levels of <tt>rcu_node</tt> +tree permits better testing of RCU's combining-tree code. + +</p><p>The <tt>RCU_FANOUT</tt> symbol controls how many children +are permitted at each non-leaf level of the <tt>rcu_node</tt> tree. +If the <tt>CONFIG_RCU_FANOUT</tt> Kconfig option is not specified, +it is set based on the word size of the system, which is also +the Kconfig default. + +</p><p>The <tt>RCU_FANOUT_LEAF</tt> symbol controls how many CPUs are +handled by each leaf <tt>rcu_node</tt> structure. +Experience has shown that allowing a given leaf <tt>rcu_node</tt> +structure to handle 64 CPUs, as permitted by the number of bits in +the <tt>->qsmask</tt> field on a 64-bit system, results in +excessive contention for the leaf <tt>rcu_node</tt> structures' +<tt>->lock</tt> fields. +The number of CPUs per leaf <tt>rcu_node</tt> structure is therefore +limited to 16 given the default value of <tt>CONFIG_RCU_FANOUT_LEAF</tt>. +If <tt>CONFIG_RCU_FANOUT_LEAF</tt> is unspecified, the value +selected is based on the word size of the system, just as for +<tt>CONFIG_RCU_FANOUT</tt>. +Lines 11-19 perform this computation. 
+ +</p><p>Lines 21-24 compute the maximum number of CPUs supported by +a single-level (which contains a single <tt>rcu_node</tt> structure), +two-level, three-level, and four-level <tt>rcu_node</tt> tree, +respectively, given the fanout specified by <tt>RCU_FANOUT</tt> +and <tt>RCU_FANOUT_LEAF</tt>. +These numbers of CPUs are retained in the +<tt>RCU_FANOUT_1</tt>, +<tt>RCU_FANOUT_2</tt>, +<tt>RCU_FANOUT_3</tt>, and +<tt>RCU_FANOUT_4</tt> +C-preprocessor variables, respectively. + +</p><p>These variables are used to control the C-preprocessor <tt>#if</tt> +statement spanning lines 26-66 that computes the number of +<tt>rcu_node</tt> structures required for each level of the tree, +as well as the number of levels required. +The number of levels is placed in the <tt>RCU_NUM_LVLS</tt> +C-preprocessor variable by lines 27, 35, 44, and 54. +The number of <tt>rcu_node</tt> structures for the topmost level +of the tree is always exactly one, and this value is unconditionally +placed into <tt>NUM_RCU_LVL_0</tt> by lines 28, 36, 45, and 55. +The rest of the levels (if any) of the <tt>rcu_node</tt> tree +are computed by dividing the maximum number of CPUs by the +fanout supported by the number of levels from the current level down, +rounding up. This computation is performed by lines 37, +46-47, and 56-58. +Lines 31-33, 40-42, 50-52, and 61-63 create initializers +for lockdep lock-class names. +Finally, lines 64-66 produce an error if the maximum number of +CPUs is too large for the specified fanout. + +<h3><a name="The rcu_data Structure"> +The <tt>rcu_data</tt> Structure</a></h3> + +<p>The <tt>rcu_data</tt> maintains the per-CPU state for the +corresponding flavor of RCU. +The fields in this structure may be accessed only from the corresponding +CPU (and from tracing) unless otherwise stated. +This structure is the +focus of quiescent-state detection and RCU callback queuing. 
+It also tracks its relationship to the corresponding leaf +<tt>rcu_node</tt> structure to allow more-efficient +propagation of quiescent states up the <tt>rcu_node</tt> +combining tree. +Like the <tt>rcu_node</tt> structure, it provides a local +copy of the grace-period information to allow for-free +synchronized +access to this information from the corresponding CPU. +Finally, this structure records past dyntick-idle state +for the corresponding CPU and also tracks statistics. + +</p><p>The <tt>rcu_data</tt> structure's fields are discussed, +singly and in groups, in the following sections. + +<h5>Connection to Other Data Structures</h5> + +<p>This portion of the <tt>rcu_data</tt> structure is declared +as follows: + +<pre> + 1 int cpu; + 2 struct rcu_state *rsp; + 3 struct rcu_node *mynode; + 4 struct rcu_dynticks *dynticks; + 5 unsigned long grpmask; + 6 bool beenonline; +</pre> + +<p>The <tt>->cpu</tt> field contains the number of the +corresponding CPU, the <tt>->rsp</tt> pointer references +the corresponding <tt>rcu_state</tt> structure (and is most frequently +used to locate the name of the corresponding flavor of RCU for tracing), +and the <tt>->mynode</tt> field references the corresponding +<tt>rcu_node</tt> structure. +The <tt>->mynode</tt> is used to propagate quiescent states +up the combining tree. +<p>The <tt>->dynticks</tt> pointer references the +<tt>rcu_dynticks</tt> structure corresponding to this +CPU. +Recall that a single per-CPU instance of the <tt>rcu_dynticks</tt> +structure is shared among all flavors of RCU. +These first four fields are constant and therefore require no +synchronization. + +</p><p>The <tt>->grpmask</tt> field indicates the bit in +the <tt>->mynode->qsmask</tt> corresponding to this +<tt>rcu_data</tt> structure, and is also used when propagating +quiescent states. 
+The <tt>->beenonline</tt> flag is set whenever the corresponding +CPU comes online, which means that the debugfs tracing need not dump +out any <tt>rcu_data</tt> structure for which this flag is not set. + +<h5>Quiescent-State and Grace-Period Tracking</h5> + +<p>This portion of the <tt>rcu_data</tt> structure is declared +as follows: + +<pre> + 1 unsigned long completed; + 2 unsigned long gpnum; + 3 bool cpu_no_qs; + 4 bool core_needs_qs; + 5 bool gpwrap; + 6 unsigned long rcu_qs_ctr_snap; +</pre> + +<p>The <tt>completed</tt> and <tt>gpnum</tt> +fields are the counterparts of the fields of the same name +in the <tt>rcu_state</tt> and <tt>rcu_node</tt> structures. +They may each lag up to one behind their <tt>rcu_node</tt> +counterparts, but in <tt>CONFIG_NO_HZ_IDLE</tt> and +<tt>CONFIG_NO_HZ_FULL</tt> kernels can lag +arbitrarily far behind for CPUs in dyntick-idle mode (but these counters +will catch up upon exit from dyntick-idle mode). +If a given <tt>rcu_data</tt> structure's <tt>->gpnum</tt> and +<tt>->completed</tt> fields are equal, then this <tt>rcu_data</tt> +structure believes that RCU is idle. +Otherwise, as with the <tt>rcu_state</tt> and <tt>rcu_node</tt> +structures, +the <tt>->gpnum</tt> field will be one greater than the +<tt>->completed</tt> field, with <tt>->gpnum</tt> +indicating which grace period this <tt>rcu_data</tt> believes +is still being waited for. + +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + All this replication of the grace period numbers can only cause + massive confusion. + Why not just keep a global pair of counters and be done with it??? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + Because if there was only a single global pair of grace-period + numbers, there would need to be a single global lock to allow + safely accessing and updating them. 
+ And if we are not going to have a single global lock, we need + to carefully manage the numbers on a per-node basis. + Recall from the answer to a previous Quick Quiz that the consequences + of applying a previously sampled quiescent state to the wrong + grace period are quite severe. +</font></td></tr> +<tr><td> </td></tr> +</table> + +<p>The <tt>->cpu_no_qs</tt> flag indicates that the +CPU has not yet passed through a quiescent state, +while the <tt>->core_needs_qs</tt> flag indicates that the +RCU core needs a quiescent state from the corresponding CPU. +The <tt>->gpwrap</tt> field indicates that the corresponding +CPU has remained idle for so long that the <tt>completed</tt> +and <tt>gpnum</tt> counters are in danger of overflow, which +will cause the CPU to disregard the values of its counters on +its next exit from idle. +Finally, the <tt>rcu_qs_ctr_snap</tt> field is used to detect +cases where a given operation has resulted in a quiescent state +for all flavors of RCU, for example, <tt>cond_resched_rcu_qs()</tt>. + +<h5>RCU Callback Handling</h5> + +<p>In the absence of CPU-hotplug events, RCU callbacks are invoked by +the same CPU that registered them. +This is strictly a cache-locality optimization: callbacks can and +do get invoked on CPUs other than the one that registered them. +After all, if the CPU that registered a given callback has gone +offline before the callback can be invoked, there really is no other +choice. 
+ +</p><p>This portion of the <tt>rcu_data</tt> structure is declared +as follows: + +<pre> + 1 struct rcu_head *nxtlist; + 2 struct rcu_head **nxttail[RCU_NEXT_SIZE]; + 3 unsigned long nxtcompleted[RCU_NEXT_SIZE]; + 4 long qlen_lazy; + 5 long qlen; + 6 long qlen_last_fqs_check; + 7 unsigned long n_force_qs_snap; + 8 unsigned long n_cbs_invoked; + 9 unsigned long n_cbs_orphaned; +10 unsigned long n_cbs_adopted; +11 long blimit; +</pre> + +<p>The <tt>->nxtlist</tt> pointer and the +<tt>->nxttail[]</tt> array form a four-segment list with +older callbacks near the head and newer ones near the tail. +Each segment contains callbacks with the corresponding relationship +to the current grace period. +The pointer out of the end of each of the four segments is referenced +by the element of the <tt>->nxttail[]</tt> array indexed by +<tt>RCU_DONE_TAIL</tt> (for callbacks handled by a prior grace period), +<tt>RCU_WAIT_TAIL</tt> (for callbacks waiting on the current grace period), +<tt>RCU_NEXT_READY_TAIL</tt> (for callbacks that will wait on the next +grace period), and +<tt>RCU_NEXT_TAIL</tt> (for callbacks that are not yet associated +with a specific grace period) +respectively, as shown in the following figure. + +</p><p><img src="nxtlist.svg" alt="nxtlist.svg" width="40%"> + +</p><p>In this figure, the <tt>->nxtlist</tt> pointer references the +first +RCU callback in the list. +The <tt>->nxttail[RCU_DONE_TAIL]</tt> array element references +the <tt>->nxtlist</tt> pointer itself, indicating that none +of the callbacks is ready to invoke. +The <tt>->nxttail[RCU_WAIT_TAIL]</tt> array element references callback +CB 2's <tt>->next</tt> pointer, which indicates that +CB 1 and CB 2 are both waiting on the current grace period. +The <tt>->nxttail[RCU_NEXT_READY_TAIL]</tt> array element +references the same RCU callback that <tt>->nxttail[RCU_WAIT_TAIL]</tt> +does, which indicates that there are no callbacks waiting on the next +RCU grace period. 
+The <tt>->nxttail[RCU_NEXT_TAIL]</tt> array element references +CB 4's <tt>->next</tt> pointer, indicating that all the +remaining RCU callbacks have not yet been assigned to an RCU grace +period. +Note that the <tt>->nxttail[RCU_NEXT_TAIL]</tt> array element +always references the last RCU callback's <tt>->next</tt> pointer +unless the callback list is empty, in which case it references +the <tt>->nxtlist</tt> pointer. + +</p><p>CPUs advance their callbacks from the +<tt>RCU_NEXT_TAIL</tt> to the <tt>RCU_NEXT_READY_TAIL</tt> to the +<tt>RCU_WAIT_TAIL</tt> to the <tt>RCU_DONE_TAIL</tt> list segments +as grace periods advance. +The CPU advances the callbacks in its <tt>rcu_data</tt> structure +whenever it notices that another RCU grace period has completed. +The CPU detects the completion of an RCU grace period by noticing +that the value of its <tt>rcu_data</tt> structure's +<tt>->completed</tt> field differs from that of its leaf +<tt>rcu_node</tt> structure. +Recall that each <tt>rcu_node</tt> structure's +<tt>->completed</tt> field is updated at the end of each +grace period. + +</p><p>The <tt>->nxtcompleted[]</tt> array records grace-period +numbers corresponding to the list segments. +This allows CPUs that go idle for extended periods to determine +which of their callbacks are ready to be invoked after reawakening. + +</p><p>The <tt>->qlen</tt> counter contains the number of +callbacks in <tt>->nxtlist</tt>, and the +<tt>->qlen_lazy</tt> contains the number of those callbacks that +are known to only free memory, and whose invocation can therefore +be safely deferred. +The <tt>->qlen_last_fqs_check</tt> and +<tt>->n_force_qs_snap</tt> coordinate the forcing of quiescent +states from <tt>call_rcu()</tt> and friends when callback +lists grow excessively long. 
+ +</p><p>The <tt>->n_cbs_invoked</tt>, +<tt>->n_cbs_orphaned</tt>, and <tt>->n_cbs_adopted</tt> +fields count the number of callbacks invoked, +sent to other CPUs when this CPU goes offline, +and received from other CPUs when those other CPUs go offline. +Finally, the <tt>->blimit</tt> counter is the maximum number of +RCU callbacks that may be invoked at a given time. + +<h5>Dyntick-Idle Handling</h5> + +<p>This portion of the <tt>rcu_data</tt> structure is declared +as follows: + +<pre> + 1 int dynticks_snap; + 2 unsigned long dynticks_fqs; +</pre> + +The <tt>->dynticks_snap</tt> field is used to take a snapshot +of the corresponding CPU's dyntick-idle state when forcing +quiescent states, and is therefore accessed from other CPUs. +Finally, the <tt>->dynticks_fqs</tt> field is used to +count the number of times this CPU is determined to be in +dyntick-idle state, and is used for tracing and debugging purposes. + +<h3><a name="The rcu_dynticks Structure"> +The <tt>rcu_dynticks</tt> Structure</a></h3> + +<p>The <tt>rcu_dynticks</tt> maintains the per-CPU dyntick-idle state +for the corresponding CPU. +Unlike the other structures, <tt>rcu_dynticks</tt> is not +replicated over the different flavors of RCU. +The fields in this structure may be accessed only from the corresponding +CPU (and from tracing) unless otherwise stated. +Its fields are as follows: + +<pre> + 1 int dynticks_nesting; + 2 int dynticks_nmi_nesting; + 3 atomic_t dynticks; +</pre> + +<p>The <tt>->dynticks_nesting</tt> field counts the +nesting depth of normal interrupts. +In addition, this counter is incremented when exiting dyntick-idle +mode and decremented when entering it. +This counter can therefore be thought of as counting the number +of reasons why this CPU cannot be permitted to enter dyntick-idle +mode, aside from non-maskable interrupts (NMIs). +NMIs are counted by the <tt>->dynticks_nmi_nesting</tt> +field, except that NMIs that interrupt non-dyntick-idle execution +are not counted. 
+ +</p><p>Finally, the <tt>->dynticks</tt> field counts the corresponding +CPU's transitions to and from dyntick-idle mode, so that this counter +has an even value when the CPU is in dyntick-idle mode and an odd +value otherwise. + +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + Why not just count all NMIs? + Wouldn't that be simpler and less error prone? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + It seems simpler only until you think hard about how to go about + updating the <tt>rcu_dynticks</tt> structure's + <tt>->dynticks</tt> field. +</font></td></tr> +<tr><td> </td></tr> +</table> + +<p>Additional fields are present for some special-purpose +builds, and are discussed separately. + +<h3><a name="The rcu_head Structure"> +The <tt>rcu_head</tt> Structure</a></h3> + +<p>Each <tt>rcu_head</tt> structure represents an RCU callback. +These structures are normally embedded within RCU-protected data +structures whose algorithms use asynchronous grace periods. +In contrast, when using algorithms that block waiting for RCU grace periods, +RCU users need not provide <tt>rcu_head</tt> structures. + +</p><p>The <tt>rcu_head</tt> structure has fields as follows: + +<pre> + 1 struct rcu_head *next; + 2 void (*func)(struct rcu_head *head); +</pre> + +<p>The <tt>->next</tt> field is used +to link the <tt>rcu_head</tt> structures together in the +lists within the <tt>rcu_data</tt> structures. +The <tt>->func</tt> field is a pointer to the function +to be called when the callback is ready to be invoked, and +this function is passed a pointer to the <tt>rcu_head</tt> +structure. +However, <tt>kfree_rcu()</tt> uses the <tt>->func</tt> +field to record the offset of the <tt>rcu_head</tt> +structure within the enclosing RCU-protected data structure. + +</p><p>Both of these fields are used internally by RCU. +From the viewpoint of RCU users, this structure is an +opaque “cookie”. 
+ +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + Given that the callback function <tt>->func</tt> + is passed a pointer to the <tt>rcu_head</tt> structure, + how is that function supposed to find the beginning of the + enclosing RCU-protected data structure? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + In actual practice, there is a separate callback function per + type of RCU-protected data structure. + The callback function can therefore use the <tt>container_of()</tt> + macro in the Linux kernel (or other pointer-manipulation facilities + in other software environments) to find the beginning of the + enclosing structure. +</font></td></tr> +<tr><td> </td></tr> +</table> + +<h3><a name="RCU-Specific Fields in the task_struct Structure"> +RCU-Specific Fields in the <tt>task_struct</tt> Structure</a></h3> + +<p>The <tt>CONFIG_PREEMPT_RCU</tt> implementation uses some +additional fields in the <tt>task_struct</tt> structure: + +<pre> + 1 #ifdef CONFIG_PREEMPT_RCU + 2 int rcu_read_lock_nesting; + 3 union rcu_special rcu_read_unlock_special; + 4 struct list_head rcu_node_entry; + 5 struct rcu_node *rcu_blocked_node; + 6 #endif /* #ifdef CONFIG_PREEMPT_RCU */ + 7 #ifdef CONFIG_TASKS_RCU + 8 unsigned long rcu_tasks_nvcsw; + 9 bool rcu_tasks_holdout; +10 struct list_head rcu_tasks_holdout_list; +11 int rcu_tasks_idle_cpu; +12 #endif /* #ifdef CONFIG_TASKS_RCU */ +</pre> + +<p>The <tt>->rcu_read_lock_nesting</tt> field records the +nesting level for RCU read-side critical sections, and +the <tt>->rcu_read_unlock_special</tt> field is a bitmask +that records special conditions that require <tt>rcu_read_unlock()</tt> +to do additional work. 
+The <tt>->rcu_node_entry</tt> field is used to form lists of +tasks that have blocked within preemptible-RCU read-side critical +sections and the <tt>->rcu_blocked_node</tt> field references +the <tt>rcu_node</tt> structure whose list this task is a member of, +or <tt>NULL</tt> if it is not blocked within a preemptible-RCU +read-side critical section. + +<p>The <tt>->rcu_tasks_nvcsw</tt> field tracks the number of +voluntary context switches that this task had undergone at the +beginning of the current tasks-RCU grace period, +<tt>->rcu_tasks_holdout</tt> is set if the current tasks-RCU +grace period is waiting on this task, <tt>->rcu_tasks_holdout_list</tt> +is a list element enqueuing this task on the holdout list, +and <tt>->rcu_tasks_idle_cpu</tt> tracks which CPU this +idle task is running on, but only if the task is currently running, +that is, if the CPU is currently idle. + +<h3><a name="Accessor Functions"> +Accessor Functions</a></h3> + +<p>The following listing shows the +<tt>rcu_get_root()</tt>, <tt>rcu_for_each_node_breadth_first()</tt>, +<tt>rcu_for_each_nonleaf_node_breadth_first()</tt>, and +<tt>rcu_for_each_leaf_node()</tt> function and macros: + +<pre> + 1 static struct rcu_node *rcu_get_root(struct rcu_state *rsp) + 2 { + 3 return &rsp->node[0]; + 4 } + 5 + 6 #define rcu_for_each_node_breadth_first(rsp, rnp) \ + 7 for ((rnp) = &(rsp)->node[0]; \ + 8 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) + 9 + 10 #define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ + 11 for ((rnp) = &(rsp)->node[0]; \ + 12 (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++) + 13 + 14 #define rcu_for_each_leaf_node(rsp, rnp) \ + 15 for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \ + 16 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) +</pre> + +<p>The <tt>rcu_get_root()</tt> simply returns a pointer to the +first element of the specified <tt>rcu_state</tt> structure's +<tt>->node[]</tt> array, which is the root <tt>rcu_node</tt> +structure. 
+ +</p><p>As noted earlier, the <tt>rcu_for_each_node_breadth_first()</tt> +macro takes advantage of the layout of the <tt>rcu_node</tt> +structures in the <tt>rcu_state</tt> structure's +<tt>->node[]</tt> array, performing a breadth-first traversal by +simply traversing the array in order. +The <tt>rcu_for_each_nonleaf_node_breadth_first()</tt> macro operates +similarly, but traverses only the first part of the array, thus excluding +the leaf <tt>rcu_node</tt> structures. +Finally, the <tt>rcu_for_each_leaf_node()</tt> macro traverses only +the last part of the array, thus traversing only the leaf +<tt>rcu_node</tt> structures. + +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + What do <tt>rcu_for_each_nonleaf_node_breadth_first()</tt> and + <tt>rcu_for_each_leaf_node()</tt> do if the <tt>rcu_node</tt> tree + contains only a single node? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + In the single-node case, + <tt>rcu_for_each_nonleaf_node_breadth_first()</tt> is a no-op + and <tt>rcu_for_each_leaf_node()</tt> traverses the single node. +</font></td></tr> +<tr><td> </td></tr> +</table> + +<h3><a name="Summary"> +Summary</a></h3> + +So each flavor of RCU is represented by an <tt>rcu_state</tt> structure, +which contains a combining tree of <tt>rcu_node</tt> and +<tt>rcu_data</tt> structures. +Finally, in <tt>CONFIG_NO_HZ_IDLE</tt> kernels, each CPU's dyntick-idle +state is tracked by an <tt>rcu_dynticks</tt> structure. + +If you made it this far, you are well prepared to read the code +walkthroughs in the other articles in this series. + +<h3><a name="Acknowledgments"> +Acknowledgments</a></h3> + +I owe thanks to Cyrill Gorcunov, Mathieu Desnoyers, Dhaval Giani, Paul +Turner, Abhishek Srivastava, Matt Kowalczyk, and Serge Hallyn +for helping me get this document into a more human-readable state. 
+ +<h3><a name="Legal Statement"> +Legal Statement</a></h3> + +<p>This work represents the view of the author and does not necessarily +represent the view of IBM. + +</p><p>Linux is a registered trademark of Linus Torvalds. + +</p><p>Other company, product, and service names may be trademarks or +service marks of others. + +</body></html> diff --git a/Documentation/RCU/Design/Data-Structures/HugeTreeClassicRCU.svg b/Documentation/RCU/Design/Data-Structures/HugeTreeClassicRCU.svg new file mode 100644 index 000000000000..2bf12b468206 --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/HugeTreeClassicRCU.svg @@ -0,0 +1,939 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- Creator: fig2dev Version 3.2 Patchlevel 5e --> + +<!-- CreationDate: Wed Dec 9 17:37:22 2015 --> + +<!-- Magnification: 3.000 --> + +<svg + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns="http://www.w3.org/2000/svg" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + width="15.1in" + height="11.2in" + viewBox="-66 -66 18087 13407" + id="svg2" + version="1.1" + inkscape:version="0.48.4 r9939" + sodipodi:docname="HugeTreeClassicRCU.fig"> + <metadata + id="metadata224"> + <rdf:RDF> + <cc:Work + rdf:about=""> + <dc:format>image/svg+xml</dc:format> + <dc:type + rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> + <dc:title></dc:title> + </cc:Work> + </rdf:RDF> + </metadata> + <defs + id="defs222"> + <marker + inkscape:stockid="Arrow1Mend" + orient="auto" + refY="0.0" + refX="0.0" + id="Arrow1Mend" + style="overflow:visible;"> + <path + id="path3982" + d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z " + style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;" + transform="scale(0.4) rotate(180) translate(10,0)" /> + </marker> + </defs> 
+ <sodipodi:namedview + pagecolor="#ffffff" + bordercolor="#666666" + borderopacity="1" + objecttolerance="10" + gridtolerance="10" + guidetolerance="10" + inkscape:pageopacity="0" + inkscape:pageshadow="2" + inkscape:window-width="1134" + inkscape:window-height="789" + id="namedview220" + showgrid="false" + inkscape:zoom="0.60515873" + inkscape:cx="679.5" + inkscape:cy="504" + inkscape:window-x="786" + inkscape:window-y="24" + inkscape:window-maximized="0" + inkscape:current-layer="g4" /> + <g + style="stroke-width:.025in; fill:none" + id="g4"> + <!-- Line: box --> + <rect + x="450" + y="0" + width="17100" + height="8325" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; " + id="rect6" /> + <!-- Line: box --> + <rect + x="11025" + y="3600" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect8" /> + <!-- Line: box --> + <rect + x="4275" + y="3600" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect10" /> + <!-- Line: box --> + <rect + x="5400" + y="6300" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect12" /> + <!-- Line: box --> + <rect + x="9900" + y="6300" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect14" /> + <!-- Line: box --> + <rect + x="14400" + y="6300" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect16" /> + <!-- Line: box --> + <rect + x="900" + y="6300" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + 
id="rect18" /> + <!-- Line: box --> + <rect + x="7650" + y="900" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect20" /> + <!-- Line --> + <polyline + points="3150,9225 3150,7746 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline22" /> + <!-- Arrowhead on XXXpoint 3150 9225 - 3150 7560--> + <!-- Circle --> + <circle + cx="8550" + cy="4275" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle26" /> + <!-- Circle --> + <circle + cx="9000" + cy="4275" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle28" /> + <!-- Circle --> + <circle + cx="9450" + cy="4275" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle30" /> + <!-- Line --> + <polyline + points="6750,6300 8250,5010 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline32" /> + <!-- Arrowhead on XXXpoint 6750 6300 - 8391 4890--> + <!-- Line --> + <polyline + points="11250,6300 9747,5010 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline36" /> + <!-- Arrowhead on XXXpoint 11250 6300 - 9606 4890--> + <!-- Circle --> + <circle + cx="13950" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle40" /> + <!-- Circle --> + <circle + cx="13500" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle42" /> + <!-- Circle --> + <circle + cx="13050" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle44" /> + <!-- Circle --> + <circle + cx="9450" + cy="6975" + r="114" + 
style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle46" /> + <!-- Circle --> + <circle + cx="9000" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle48" /> + <!-- Circle --> + <circle + cx="8550" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle50" /> + <!-- Circle --> + <circle + cx="4950" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle52" /> + <!-- Circle --> + <circle + cx="4500" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle54" /> + <!-- Circle --> + <circle + cx="4050" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle56" /> + <!-- Circle --> + <circle + cx="1800" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle58" /> + <!-- Circle --> + <circle + cx="2250" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle60" /> + <!-- Circle --> + <circle + cx="2700" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle62" /> + <!-- Circle --> + <circle + cx="15300" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle64" /> + <!-- Circle --> + <circle + cx="15750" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle66" /> + <!-- Circle --> + <circle + cx="16200" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle68" /> + <!-- Circle --> + <circle + cx="10800" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle70" /> + <!-- Circle --> + <circle + cx="11250" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle72" /> + <!-- Circle --> + <circle + cx="11700" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle74" /> + <!-- Circle --> 
+ <circle + cx="6300" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle76" /> + <!-- Circle --> + <circle + cx="6750" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle78" /> + <!-- Circle --> + <circle + cx="7200" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle80" /> + <!-- Line: box --> + <rect + x="0" + y="11475" + width="2700" + height="1800" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect82" /> + <!-- Line: box --> + <rect + x="1800" + y="9225" + width="2700" + height="1800" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect84" /> + <!-- Line: box --> + <rect + x="4500" + y="11475" + width="2700" + height="1800" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect86" /> + <!-- Line: box --> + <rect + x="6300" + y="9270" + width="2700" + height="1800" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect88" /> + <!-- Line: box --> + <rect + x="8955" + y="11475" + width="2700" + height="1800" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect90" /> + <!-- Line: box --> + <rect + x="10755" + y="9270" + width="2700" + height="1800" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect92" /> + <!-- Line: box --> + <rect + x="13455" + y="11475" + width="2700" + height="1800" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect94" /> + <!-- Line: box --> + <rect + x="15255" + y="9270" + width="2700" + height="1800" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect96" /> + <!-- Line --> + <polyline + 
points="11700,3600 10197,2310 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline98" /> + <!-- Arrowhead on XXXpoint 11700 3600 - 10056 2190--> + <!-- Line --> + <polyline + points="6300,3600 7800,2310 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline102" /> + <!-- Arrowhead on XXXpoint 6300 3600 - 7941 2190--> + <!-- Line --> + <polyline + points="3150,6300 4650,5010 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline106" /> + <!-- Arrowhead on XXXpoint 3150 6300 - 4791 4890--> + <!-- Line --> + <polyline + points="14850,6300 13347,5010 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline110" /> + <!-- Arrowhead on XXXpoint 14850 6300 - 13206 4890--> + <!-- Line --> + <polyline + points="1350,11475 1350,7746 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline114" /> + <!-- Arrowhead on XXXpoint 1350 11475 - 1350 7560--> + <!-- Line --> + <polyline + points="16650,9225 16650,7746 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline118" /> + <!-- Arrowhead on XXXpoint 16650 9225 - 16650 7560--> + <!-- Line --> + <polyline + points="14850,11475 14850,7746 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + 
id="polyline122" /> + <!-- Arrowhead on XXXpoint 14850 11475 - 14850 7560--> + <!-- Line --> + <polyline + points="12150,9225 12150,7746 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline126" /> + <!-- Arrowhead on XXXpoint 12150 9225 - 12150 7560--> + <!-- Line --> + <polyline + points="10350,11475 10350,7746 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline130" /> + <!-- Arrowhead on XXXpoint 10350 11475 - 10350 7560--> + <!-- Line --> + <polyline + points="7650,9225 7650,7746 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline134" /> + <!-- Arrowhead on XXXpoint 7650 9225 - 7650 7560--> + <!-- Line --> + <polyline + points="5850,11475 5850,7746 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline138" /> + <!-- Arrowhead on XXXpoint 5850 11475 - 5850 7560--> + <!-- Text --> + <text + xml:space="preserve" + x="12375" + y="4500" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text142">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="12375" + y="4050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text144">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="5625" + y="4050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text146">struct</text> + <!-- Text --> + <text + 
xml:space="preserve" + x="5625" + y="4500" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text148">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="6750" + y="6750" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text150">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="6750" + y="7200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text152">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="11250" + y="7200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text154">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="11250" + y="6750" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text156">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="15750" + y="7200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text158">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="15750" + y="6750" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text160">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="2250" + y="6750" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text162">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="2250" + y="7200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text164">rcu_node</text> + 
<!-- Text --> + <text + xml:space="preserve" + x="1350" + y="13050" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text166">CPU 0</text> + <!-- Text --> + <text + xml:space="preserve" + x="1350" + y="11925" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text168">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="1350" + y="12375" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text170">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="3150" + y="10800" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text172">CPU 15</text> + <!-- Text --> + <text + xml:space="preserve" + x="3150" + y="9675" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text174">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="3150" + y="10125" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text176">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="5850" + y="11925" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text178">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="5850" + y="12375" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text180">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="5850" + y="13050" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + 
text-anchor="middle" + id="text182">CPU 21823</text> + <!-- Text --> + <text + xml:space="preserve" + x="7650" + y="10845" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text184">CPU 21839</text> + <!-- Text --> + <text + xml:space="preserve" + x="7650" + y="10170" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text186">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="7650" + y="9720" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text188">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="10305" + y="11925" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text190">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="10305" + y="12375" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text192">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="10305" + y="13050" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text194">CPU 43679</text> + <!-- Text --> + <text + xml:space="preserve" + x="12105" + y="10845" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text196">CPU 43695</text> + <!-- Text --> + <text + xml:space="preserve" + x="12105" + y="10170" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text198">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="12105" + y="9720" + fill="#000000" + font-family="Courier" + 
font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text200">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="14805" + y="11925" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text202">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="14805" + y="12375" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text204">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="14805" + y="13050" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text206">CPU 65519</text> + <!-- Text --> + <text + xml:space="preserve" + x="16605" + y="10845" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text208">CPU 65535</text> + <!-- Text --> + <text + xml:space="preserve" + x="16605" + y="10170" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text210">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="16605" + y="9720" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text212">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="675" + y="450" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="start" + id="text214">struct rcu_state</text> + <!-- Text --> + <text + xml:space="preserve" + x="9000" + y="1350" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text216">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="9000" + y="1800" 
+ fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text218">rcu_node</text> + </g> +</svg> diff --git a/Documentation/RCU/Design/Data-Structures/TreeLevel.svg b/Documentation/RCU/Design/Data-Structures/TreeLevel.svg new file mode 100644 index 000000000000..7a7eb3bac95c --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/TreeLevel.svg @@ -0,0 +1,828 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- Creator: fig2dev Version 3.2 Patchlevel 5e --> + +<!-- CreationDate: Wed Dec 9 17:41:29 2015 --> + +<!-- Magnification: 3.000 --> + +<svg + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns="http://www.w3.org/2000/svg" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + width="17.7in" + height="10.4in" + viewBox="-66 -66 21237 12507" + id="svg2" + version="1.1" + inkscape:version="0.48.4 r9939" + sodipodi:docname="TreeLevel.fig"> + <metadata + id="metadata216"> + <rdf:RDF> + <cc:Work + rdf:about=""> + <dc:format>image/svg+xml</dc:format> + <dc:type + rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> + <dc:title></dc:title> + </cc:Work> + </rdf:RDF> + </metadata> + <defs + id="defs214"> + <marker + inkscape:stockid="Arrow1Mend" + orient="auto" + refY="0.0" + refX="0.0" + id="Arrow1Mend" + style="overflow:visible;"> + <path + id="path3974" + d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z " + style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;" + transform="scale(0.4) rotate(180) translate(10,0)" /> + </marker> + </defs> + <sodipodi:namedview + pagecolor="#ffffff" + bordercolor="#666666" + borderopacity="1" + objecttolerance="10" + gridtolerance="10" + guidetolerance="10" + inkscape:pageopacity="0" + inkscape:pageshadow="2" 
+ inkscape:window-width="1023" + inkscape:window-height="1148" + id="namedview212" + showgrid="false" + inkscape:zoom="0.55869424" + inkscape:cx="796.50006" + inkscape:cy="467.99997" + inkscape:window-x="897" + inkscape:window-y="24" + inkscape:window-maximized="0" + inkscape:current-layer="g4" /> + <g + style="stroke-width:.025in; fill:none" + id="g4"> + <!-- Line: box --> + <rect + x="0" + y="0" + width="20655" + height="8325" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; " + id="rect6" /> + <!-- Line: box --> + <rect + x="14130" + y="3600" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect8" /> + <!-- Line: box --> + <rect + x="7380" + y="3600" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect10" /> + <!-- Line: box --> + <rect + x="8505" + y="6300" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect12" /> + <!-- Line: box --> + <rect + x="13005" + y="6300" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect14" /> + <!-- Line: box --> + <rect + x="17505" + y="6300" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect16" /> + <!-- Line: box --> + <rect + x="4005" + y="6300" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect18" /> + <!-- Line: box --> + <rect + x="10755" + y="900" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; 
fill:#ffbfbf; " + id="rect20" /> + <!-- Line --> + <polyline + points="6255,9225 6255,7746 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline22" /> + <!-- Arrowhead on XXXpoint 6255 9225 - 6255 7560--> + <!-- Circle --> + <circle + cx="11655" + cy="4275" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle26" /> + <!-- Circle --> + <circle + cx="12105" + cy="4275" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle28" /> + <!-- Circle --> + <circle + cx="12555" + cy="4275" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle30" /> + <!-- Line --> + <polyline + points="9855,6300 11355,5010 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline32" /> + <!-- Arrowhead on XXXpoint 9855 6300 - 11496 4890--> + <!-- Line --> + <polyline + points="14355,6300 12852,5010 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline36" /> + <!-- Arrowhead on XXXpoint 14355 6300 - 12711 4890--> + <!-- Circle --> + <circle + cx="17055" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle40" /> + <!-- Circle --> + <circle + cx="16605" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle42" /> + <!-- Circle --> + <circle + cx="16155" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle44" /> + <!-- Circle --> + <circle + cx="12555" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle46" /> + <!-- Circle --> + <circle + cx="12105" + cy="6975" + r="114" + 
style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle48" /> + <!-- Circle --> + <circle + cx="11655" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle50" /> + <!-- Circle --> + <circle + cx="8055" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle52" /> + <!-- Circle --> + <circle + cx="7605" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle54" /> + <!-- Circle --> + <circle + cx="7155" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle56" /> + <!-- Circle --> + <circle + cx="4905" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle58" /> + <!-- Circle --> + <circle + cx="5355" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle60" /> + <!-- Circle --> + <circle + cx="5805" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle62" /> + <!-- Circle --> + <circle + cx="18405" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle64" /> + <!-- Circle --> + <circle + cx="18855" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle66" /> + <!-- Circle --> + <circle + cx="19305" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle68" /> + <!-- Circle --> + <circle + cx="13905" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle70" /> + <!-- Circle --> + <circle + cx="14355" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle72" /> + <!-- Circle --> + <circle + cx="14805" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle74" /> + <!-- Circle --> + <circle + cx="9405" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle76" /> + <!-- Circle 
--> + <circle + cx="9855" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle78" /> + <!-- Circle --> + <circle + cx="10305" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle80" /> + <!-- Line: box --> + <rect + x="225" + y="1125" + width="3150" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:21; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; " + id="rect82" /> + <!-- Line: box --> + <rect + x="225" + y="2250" + width="3150" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:21; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; " + id="rect84" /> + <!-- Line: box --> + <rect + x="225" + y="3375" + width="3150" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:21; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; " + id="rect86" /> + <!-- Line --> + <polyline + points="14805,3600 13302,2310 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline88" /> + <!-- Arrowhead on XXXpoint 14805 3600 - 13161 2190--> + <!-- Line --> + <polyline + points="9405,3600 10905,2310 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline92" /> + <!-- Arrowhead on XXXpoint 9405 3600 - 11046 2190--> + <!-- Line --> + <polyline + points="6255,6300 7755,5010 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline96" /> + <!-- Arrowhead on XXXpoint 6255 6300 - 7896 4890--> + <!-- Line --> + <polyline + points="17955,6300 16452,5010 " + 
style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline100" /> + <!-- Arrowhead on XXXpoint 17955 6300 - 16311 4890--> + <!-- Line --> + <polyline + points="4455,11025 4455,7746 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline104" /> + <!-- Arrowhead on XXXpoint 4455 11025 - 4455 7560--> + <!-- Line --> + <polyline + points="19755,9225 19755,7746 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline108" /> + <!-- Arrowhead on XXXpoint 19755 9225 - 19755 7560--> + <!-- Line --> + <polyline + points="17955,11025 17955,7746 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline112" /> + <!-- Arrowhead on XXXpoint 17955 11025 - 17955 7560--> + <!-- Line --> + <polyline + points="15255,9225 15255,7746 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline116" /> + <!-- Arrowhead on XXXpoint 15255 9225 - 15255 7560--> + <!-- Line --> + <polyline + points="13455,11025 13455,7746 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline120" /> + <!-- Arrowhead on XXXpoint 13455 11025 - 13455 7560--> + <!-- Line --> + <polyline + points="10755,9225 10755,7746 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline124" /> + 
<!-- Arrowhead on XXXpoint 10755 9225 - 10755 7560--> + <!-- Line --> + <polyline + points="8955,11025 8955,7746 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline128" /> + <!-- Arrowhead on XXXpoint 8955 11025 - 8955 7560--> + <!-- Line: box --> + <rect + x="12105" + y="11025" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect132" /> + <!-- Line: box --> + <rect + x="13905" + y="9225" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect134" /> + <!-- Line: box --> + <rect + x="16605" + y="11025" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect136" /> + <!-- Line: box --> + <rect + x="18405" + y="9225" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect138" /> + <!-- Line: box --> + <rect + x="9405" + y="9225" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect140" /> + <!-- Line: box --> + <rect + x="7605" + y="11025" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect142" /> + <!-- Line: box --> + <rect + x="4905" + y="9225" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect144" /> + <!-- Line: box --> + <rect + x="3105" + y="11025" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect146" /> + <!-- Line --> + <polyline + points="3375,1575 10701,1575 " + 
style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline148" /> + <!-- Arrowhead on XXXpoint 3375 1575 - 10890 1575--> + <!-- Line --> + <polyline + points="3375,3825 4050,3825 4050,5400 2700,5400 2700,6975 3951,6975 " + style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline152" /> + <!-- Arrowhead on XXXpoint 2700 6975 - 4140 6975--> + <!-- Line --> + <polyline + points="3375,2700 5175,2700 5175,4275 7326,4275 " + style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline156" /> + <!-- Arrowhead on XXXpoint 5175 4275 - 7515 4275--> + <!-- Text --> + <text + xml:space="preserve" + x="15480" + y="4500" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text160">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="15480" + y="4050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text162">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="8730" + y="4050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text164">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="8730" + y="4500" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text166">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="9855" + y="6750" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + 
id="text168">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="9855" + y="7200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text170">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="14355" + y="7200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text172">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="14355" + y="6750" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text174">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="18855" + y="7200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text176">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="18855" + y="6750" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text178">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="5355" + y="6750" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text180">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="5355" + y="7200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text182">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="450" + y="1800" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="324" + text-anchor="start" + id="text184">->level[0]</text> + <!-- Text --> + <text + xml:space="preserve" + x="450" + y="2925" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="324" + 
text-anchor="start" + id="text186">->level[1]</text> + <!-- Text --> + <text + xml:space="preserve" + x="450" + y="4050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="324" + text-anchor="start" + id="text188">->level[2]</text> + <!-- Text --> + <text + xml:space="preserve" + x="12105" + y="1350" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text190">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="12105" + y="1800" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text192">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="6255" + y="10125" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text194">CPU 15</text> + <!-- Text --> + <text + xml:space="preserve" + x="4455" + y="11925" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text196">CPU 0</text> + <!-- Text --> + <text + xml:space="preserve" + x="19755" + y="10125" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text198">CPU 65535</text> + <!-- Text --> + <text + xml:space="preserve" + x="17955" + y="11925" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text200">CPU 65519</text> + <!-- Text --> + <text + xml:space="preserve" + x="15255" + y="10125" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text202">CPU 43695</text> + <!-- Text --> + <text + xml:space="preserve" + x="13455" + y="11925" + fill="#000000" + font-family="Helvetica" + 
font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text204">CPU 43679</text> + <!-- Text --> + <text + xml:space="preserve" + x="10755" + y="10125" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text206">CPU 21839</text> + <!-- Text --> + <text + xml:space="preserve" + x="8955" + y="11925" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text208">CPU 21823</text> + <!-- Text --> + <text + xml:space="preserve" + x="225" + y="450" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="start" + id="text210">struct rcu_state</text> + </g> +</svg> diff --git a/Documentation/RCU/Design/Data-Structures/TreeMapping.svg b/Documentation/RCU/Design/Data-Structures/TreeMapping.svg new file mode 100644 index 000000000000..729cfa9e6cdb --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/TreeMapping.svg @@ -0,0 +1,305 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- Creator: fig2dev Version 3.2 Patchlevel 5e --> + +<!-- CreationDate: Wed Dec 9 17:43:22 2015 --> + +<!-- Magnification: 1.000 --> + +<svg + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns="http://www.w3.org/2000/svg" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + width="3.1in" + height="0.9in" + viewBox="-12 -12 3699 1074" + id="svg2" + version="1.1" + inkscape:version="0.48.4 r9939" + sodipodi:docname="TreeMapping.fig"> + <metadata + id="metadata66"> + <rdf:RDF> + <cc:Work + rdf:about=""> + <dc:format>image/svg+xml</dc:format> + <dc:type + rdf:resource="http://purl.org/dc/dcmitype/StillImage" 
/> + <dc:title></dc:title> + </cc:Work> + </rdf:RDF> + </metadata> + <defs + id="defs64"> + <marker + inkscape:stockid="Arrow2Lend" + orient="auto" + refY="0.0" + refX="0.0" + id="Arrow2Lend" + style="overflow:visible;"> + <path + id="path3836" + style="fill-rule:evenodd;stroke-width:0.62500000;stroke-linejoin:round;" + d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z " + transform="scale(1.1) rotate(180) translate(1,0)" /> + </marker> + <marker + inkscape:stockid="Arrow2Mend" + orient="auto" + refY="0.0" + refX="0.0" + id="Arrow2Mend" + style="overflow:visible;"> + <path + id="path3842" + style="fill-rule:evenodd;stroke-width:0.62500000;stroke-linejoin:round;" + d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z " + transform="scale(0.6) rotate(180) translate(0,0)" /> + </marker> + <marker + inkscape:stockid="Arrow1Mend" + orient="auto" + refY="0.0" + refX="0.0" + id="Arrow1Mend" + style="overflow:visible;"> + <path + id="path3824" + d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z " + style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;" + transform="scale(0.4) rotate(180) translate(10,0)" /> + </marker> + </defs> + <sodipodi:namedview + pagecolor="#ffffff" + bordercolor="#666666" + borderopacity="1" + objecttolerance="10" + gridtolerance="10" + guidetolerance="10" + inkscape:pageopacity="0" + inkscape:pageshadow="2" + inkscape:window-width="991" + inkscape:window-height="606" + id="namedview62" + showgrid="false" + inkscape:zoom="3.0752688" + inkscape:cx="139.5" + inkscape:cy="40.5" + inkscape:window-x="891" + inkscape:window-y="177" + inkscape:window-maximized="0" + inkscape:current-layer="g4" /> + <g + style="stroke-width:.025in; fill:none" + id="g4"> + <!-- Line: box --> + <rect + x="0" + y="0" + width="3675" + height="1050" + rx="0" + 
style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; " + id="rect6" /> + <!-- Line: box --> + <rect + x="75" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect8" /> + <!-- Line: box --> + <rect + x="600" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect10" /> + <!-- Line: box --> + <rect + x="1125" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect12" /> + <!-- Line: box --> + <rect + x="1650" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect14" /> + <!-- Line: box --> + <rect + x="2175" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect16" /> + <!-- Line: box --> + <rect + x="3225" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect18" /> + <!-- Line --> + <polyline + points="675,375 675,150 300,150 300,358 " + style="stroke:#000000;stroke-width:7.00088889;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline20" /> + <!-- Arrowhead on XXXpoint 300 150 - 300 390--> + <!-- Line --> + <polyline + points="1200,675 1200,900 300,900 300,691 " + style="stroke:#000000;stroke-width:7.00088889;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline24" /> + <!-- Arrowhead on XXXpoint 300 900 - 300 660--> + <!-- Line --> + <polyline + points="1725,375 
1725,150 900,150 900,358 " + style="stroke:#000000;stroke-width:7.00088889;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline28" /> + <!-- Arrowhead on XXXpoint 900 150 - 900 390--> + <!-- Line --> + <polyline + points="2250,375 2250,75 825,75 825,358 " + style="stroke:#000000;stroke-width:7.00088889;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline32" /> + <!-- Arrowhead on XXXpoint 825 75 - 825 390--> + <!-- Line --> + <polyline + points="2775,675 2775,900 1425,900 1425,691 " + style="stroke:#000000;stroke-width:7.00088889;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline36" /> + <!-- Arrowhead on XXXpoint 1425 900 - 1425 660--> + <!-- Line --> + <polyline + points="3300,675 3300,975 1350,975 1350,691 " + style="stroke:#000000;stroke-width:7.00088889;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline40" /> + <!-- Arrowhead on XXXpoint 1350 975 - 1350 660--> + <!-- Line: box --> + <rect + x="2700" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect44" /> + <!-- Text --> + <text + xml:space="preserve" + x="300" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text46">0:7 </text> + <!-- Text --> + <text + xml:space="preserve" + x="1350" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text48">4:7 </text> + <!-- Text --> + <text + xml:space="preserve" + x="1875" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + 
text-anchor="middle" + id="text50">0:1 </text> + <!-- Text --> + <text + xml:space="preserve" + x="2400" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text52">2:3 </text> + <!-- Text --> + <text + xml:space="preserve" + x="2925" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text54">4:5 </text> + <!-- Text --> + <text + xml:space="preserve" + x="3450" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text56">6:7 </text> + <!-- Text --> + <text + xml:space="preserve" + x="825" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text58">0:3 </text> + <!-- Text --> + <text + xml:space="preserve" + x="3600" + y="150" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="end" + id="text60">struct rcu_state</text> + </g> +</svg> diff --git a/Documentation/RCU/Design/Data-Structures/TreeMappingLevel.svg b/Documentation/RCU/Design/Data-Structures/TreeMappingLevel.svg new file mode 100644 index 000000000000..5b416a4b8453 --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/TreeMappingLevel.svg @@ -0,0 +1,380 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- Creator: fig2dev Version 3.2 Patchlevel 5e --> + +<!-- CreationDate: Wed Dec 9 17:45:19 2015 --> + +<!-- Magnification: 1.000 --> + +<svg + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns="http://www.w3.org/2000/svg" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + 
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + width="3.1in" + height="1.8in" + viewBox="-12 -12 3699 2124" + id="svg2" + version="1.1" + inkscape:version="0.48.4 r9939" + sodipodi:docname="TreeMappingLevel.svg"> + <metadata + id="metadata98"> + <rdf:RDF> + <cc:Work + rdf:about=""> + <dc:format>image/svg+xml</dc:format> + <dc:type + rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> + <dc:title /> + </cc:Work> + </rdf:RDF> + </metadata> + <defs + id="defs96"> + <marker + inkscape:stockid="Arrow2Lend" + orient="auto" + refY="0.0" + refX="0.0" + id="Arrow2Lend" + style="overflow:visible;"> + <path + id="path3868" + style="fill-rule:evenodd;stroke-width:0.62500000;stroke-linejoin:round;" + d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z " + transform="scale(1.1) rotate(180) translate(1,0)" /> + </marker> + </defs> + <sodipodi:namedview + pagecolor="#ffffff" + bordercolor="#666666" + borderopacity="1" + objecttolerance="10" + gridtolerance="10" + guidetolerance="10" + inkscape:pageopacity="0" + inkscape:pageshadow="2" + inkscape:window-width="1598" + inkscape:window-height="1211" + id="namedview94" + showgrid="false" + inkscape:zoom="5.2508961" + inkscape:cx="139.5" + inkscape:cy="81" + inkscape:window-x="840" + inkscape:window-y="122" + inkscape:window-maximized="0" + inkscape:current-layer="g4" /> + <g + style="stroke-width:.025in; fill:none" + id="g4"> + <!-- Line: box --> + <rect + x="0" + y="0" + width="3675" + height="2100" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; " + id="rect6" /> + <!-- Line: box --> + <rect + x="75" + y="1350" + width="750" + height="225" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect8" /> + <!-- Line: box --> + <rect + x="75" + y="1575" + width="750" + height="225" + rx="0" + 
style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect10" /> + <!-- Line: box --> + <rect + x="75" + y="1800" + width="750" + height="225" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect12" /> + <!-- Arc --> + <path + style="stroke:#000000;stroke-width:7;stroke-linecap:butt;" + d="M 1800,900 A 118 118 0 0 0 1800 1125 " + id="path14" /> + <!-- Arc --> + <path + style="stroke:#000000;stroke-width:7;stroke-linecap:butt;" + d="M 750,900 A 75 75 0 0 0 750 1050 " + id="path16" /> + <!-- Line --> + <polyline + points="750,900 750,691 " + style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline18" /> + <!-- Arrowhead on XXXpoint 750 900 - 750 660--> + <!-- Line: box --> + <rect + x="75" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect22" /> + <!-- Line: box --> + <rect + x="600" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect24" /> + <!-- Line: box --> + <rect + x="1650" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect26" /> + <!-- Line: box --> + <rect + x="2175" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect28" /> + <!-- Line: box --> + <rect + x="3225" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect30" /> + <!-- Line --> + <polyline + points="675,375 675,150 300,150 300,358 " + 
style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline32" /> + <!-- Arrowhead on XXXpoint 300 150 - 300 390--> + <!-- Line --> + <polyline + points="1725,375 1725,150 900,150 900,358 " + style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline36" /> + <!-- Arrowhead on XXXpoint 900 150 - 900 390--> + <!-- Line --> + <polyline + points="2250,375 2250,75 825,75 825,358 " + style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline40" /> + <!-- Arrowhead on XXXpoint 825 75 - 825 390--> + <!-- Line --> + <polyline + points="2775,675 2775,975 1425,975 1425,691 " + style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline44" /> + <!-- Arrowhead on XXXpoint 1425 975 - 1425 660--> + <!-- Line: box --> + <rect + x="2700" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect48" /> + <!-- Line: box --> + <rect + x="1125" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect50" /> + <!-- Line --> + <polyline + points="3300,675 3300,1050 1350,1050 1350,691 " + style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline52" /> + <!-- Arrowhead on XXXpoint 1350 1050 - 1350 660--> + <!-- Line --> + <polyline + points="825,1425 975,1425 975,1200 225,1200 225,691 " + 
style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline56" /> + <!-- Arrowhead on XXXpoint 225 1200 - 225 660--> + <!-- Line --> + <polyline + points="1200,675 1200,975 300,975 300,691 " + style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline60" /> + <!-- Arrowhead on XXXpoint 300 975 - 300 660--> + <!-- Text --> + <text + xml:space="preserve" + x="150" + y="1500" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="108" + text-anchor="start" + id="text64">->level[0]</text> + <!-- Text --> + <text + xml:space="preserve" + x="150" + y="1725" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="108" + text-anchor="start" + id="text66">->level[1]</text> + <!-- Text --> + <text + xml:space="preserve" + x="150" + y="1950" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="108" + text-anchor="start" + id="text68">->level[2]</text> + <!-- Text --> + <text + xml:space="preserve" + x="300" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text70">0:7 </text> + <!-- Text --> + <text + xml:space="preserve" + x="1350" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text72">4:7 </text> + <!-- Text --> + <text + xml:space="preserve" + x="1875" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text74">0:1 </text> + <!-- Text --> + <text + xml:space="preserve" + x="2400" + y="525" + fill="#000000" + font-family="Times" + 
font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text76">2:3 </text> + <!-- Text --> + <text + xml:space="preserve" + x="2925" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text78">4:5 </text> + <!-- Text --> + <text + xml:space="preserve" + x="3450" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text80">6:7 </text> + <!-- Text --> + <text + xml:space="preserve" + x="825" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text82">0:3 </text> + <!-- Text --> + <text + xml:space="preserve" + x="3600" + y="150" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="end" + id="text84">struct rcu_state</text> + <!-- Line --> + <polyline + points="825,1875 1800,1875 1800,1125 " + style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:none" + id="polyline86" /> + <!-- Line --> + <polyline + points="1800,900 1800,691 " + style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline88" /> + <!-- Arrowhead on XXXpoint 1800 900 - 1800 660--> + <!-- Line --> + <polyline + points="825,1650 1200,1650 1200,1125 750,1125 750,1050 " + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; " + id="polyline92" /> + </g> +</svg> diff --git a/Documentation/RCU/Design/Data-Structures/blkd_task.svg b/Documentation/RCU/Design/Data-Structures/blkd_task.svg new file mode 100644 index 000000000000..00e810bb8419 --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/blkd_task.svg @@ -0,0 +1,843 
@@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- Creator: fig2dev Version 3.2 Patchlevel 5e --> + +<!-- CreationDate: Wed Dec 9 17:35:03 2015 --> + +<!-- Magnification: 2.000 --> + +<svg + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns="http://www.w3.org/2000/svg" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + width="10.1in" + height="8.6in" + viewBox="-44 -44 12088 10288" + id="svg2" + version="1.1" + inkscape:version="0.48.4 r9939" + sodipodi:docname="blkd_task.fig"> + <metadata + id="metadata212"> + <rdf:RDF> + <cc:Work + rdf:about=""> + <dc:format>image/svg+xml</dc:format> + <dc:type + rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> + <dc:title></dc:title> + </cc:Work> + </rdf:RDF> + </metadata> + <defs + id="defs210"> + <marker + inkscape:stockid="Arrow1Mend" + orient="auto" + refY="0.0" + refX="0.0" + id="Arrow1Mend" + style="overflow:visible;"> + <path + id="path3970" + d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z " + style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;" + transform="scale(0.4) rotate(180) translate(10,0)" /> + </marker> + </defs> + <sodipodi:namedview + pagecolor="#ffffff" + bordercolor="#666666" + borderopacity="1" + objecttolerance="10" + gridtolerance="10" + guidetolerance="10" + inkscape:pageopacity="0" + inkscape:pageshadow="2" + inkscape:window-width="1087" + inkscape:window-height="1144" + id="namedview208" + showgrid="false" + inkscape:zoom="1.0495049" + inkscape:cx="454.50003" + inkscape:cy="387.00003" + inkscape:window-x="833" + inkscape:window-y="28" + inkscape:window-maximized="0" + inkscape:current-layer="g4" /> + <g + style="stroke-width:.025in; fill:none" + id="g4"> + <!-- Line: box --> + <rect + x="450" + y="0" + width="6300" + height="7350" + 
rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; " + id="rect6" /> + <!-- Line: box --> + <rect + x="4950" + y="4950" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect8" /> + <!-- Line: box --> + <rect + x="750" + y="600" + width="5700" + height="3750" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; " + id="rect10" /> + <!-- Line --> + <polyline + points="5250,8100 5688,5912 " + style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="polyline12" /> + <!-- Arrowhead on XXXpoint 5250 8100 - 5710 5790--> + <polyline + points="5714 6068 5704 5822 5598 6044 " + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; " + id="polyline14" /> + <!-- Line --> + <polyline + points="4050,9300 4486,7262 " + style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="polyline16" /> + <!-- Arrowhead on XXXpoint 4050 9300 - 4512 7140--> + <polyline + points="4514 7418 4506 7172 4396 7394 " + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; " + id="polyline18" /> + <!-- Line --> + <polyline + points="1040,9300 1476,7262 " + style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="polyline20" /> + <!-- Arrowhead on XXXpoint 1040 9300 - 1502 7140--> + <polyline + points="1504 7418 1496 7172 1386 7394 " + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; " + id="polyline22" /> + <!-- Line --> + <polyline + points="2240,8100 2676,6062 " + style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="polyline24" /> + <!-- Arrowhead on XXXpoint 2240 8100 - 2702 5940--> + <polyline + points="2704 6218 2696 5972 2586 6194 " + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; " + id="polyline26" /> + <!-- Line: box --> 
+ <rect + x="0" + y="450" + width="6300" + height="7350" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; " + id="rect28" /> + <!-- Line: box --> + <rect + x="300" + y="1050" + width="5700" + height="3750" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; " + id="rect30" /> + <!-- Line --> + <polyline + points="1350,3450 2350,2590 " + style="stroke:#00d1d1;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline32" /> + <!-- Arrowhead on XXXpoint 1350 3450 - 2444 2510--> + <!-- Line --> + <polyline + points="4950,3450 3948,2590 " + style="stroke:#00d1d1;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline36" /> + <!-- Arrowhead on XXXpoint 4950 3450 - 3854 2510--> + <!-- Line --> + <polyline + points="4050,6600 4050,4414 " + style="stroke:#00d1d1;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline40" /> + <!-- Arrowhead on XXXpoint 4050 6600 - 4050 4290--> + <!-- Line --> + <polyline + points="1050,6600 1050,4414 " + style="stroke:#00d1d1;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline44" /> + <!-- Arrowhead on XXXpoint 1050 6600 - 1050 4290--> + <!-- Line --> + <polyline + points="2250,5400 2250,4414 " + style="stroke:#00d1d1;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline48" /> + <!-- Arrowhead on XXXpoint 2250 5400 - 2250 4290--> + <!-- Line --> + <polyline + points="2250,8100 2250,6364 " + 
style="stroke:#00ff00;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline52" /> + <!-- Arrowhead on XXXpoint 2250 8100 - 2250 6240--> + <!-- Line --> + <polyline + points="1050,9300 1050,7564 " + style="stroke:#00ff00;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline56" /> + <!-- Arrowhead on XXXpoint 1050 9300 - 1050 7440--> + <!-- Line --> + <polyline + points="4050,9300 4050,7564 " + style="stroke:#00ff00;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline60" /> + <!-- Arrowhead on XXXpoint 4050 9300 - 4050 7440--> + <!-- Line --> + <polyline + points="5250,8100 5250,6364 " + style="stroke:#00ff00;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline64" /> + <!-- Arrowhead on XXXpoint 5250 8100 - 5250 6240--> + <!-- Circle --> + <circle + cx="2850" + cy="3900" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle68" /> + <!-- Circle --> + <circle + cx="3150" + cy="3900" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle70" /> + <!-- Circle --> + <circle + cx="3450" + cy="3900" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle72" /> + <!-- Circle --> + <circle + cx="1350" + cy="5100" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle74" /> + <!-- Circle --> + <circle + cx="1650" + cy="5100" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle76" /> + <!-- Circle --> + <circle + cx="1950" + cy="5100" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle78" /> + <!-- Circle --> + <circle + cx="4350" + cy="5100" + r="76" + 
style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle80" /> + <!-- Circle --> + <circle + cx="4650" + cy="5100" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle82" /> + <!-- Circle --> + <circle + cx="4950" + cy="5100" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle84" /> + <!-- Line: box --> + <rect + x="750" + y="3450" + width="1800" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect86" /> + <!-- Line: box --> + <rect + x="300" + y="6600" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect88" /> + <!-- Line: box --> + <rect + x="4500" + y="5400" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect90" /> + <!-- Line: box --> + <rect + x="3300" + y="6600" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect92" /> + <!-- Line: box --> + <rect + x="2250" + y="1650" + width="1800" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect94" /> + <!-- Line: box --> + <rect + x="0" + y="9300" + width="2100" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; " + id="rect96" /> + <!-- Line: box --> + <rect + x="1350" + y="8100" + width="2100" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; " + id="rect98" /> + <!-- Line: box --> + <rect + x="3000" + y="9300" + width="2100" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; " + id="rect100" 
/> + <!-- Line: box --> + <rect + x="4350" + y="8100" + width="2100" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; " + id="rect102" /> + <!-- Line: box --> + <rect + x="1500" + y="5400" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect104" /> + <!-- Line --> + <polygon + points="5550,3450 7350,2850 7350,5100 5550,4350 5550,3450 " + style="stroke:#000000;stroke-width:14; stroke-linejoin:miter; stroke-linecap:butt; stroke-dasharray:120 120;fill:#ffbfbf; " + id="polygon106" /> + <!-- Line --> + <polyline + points="9300,3150 10734,3150 " + style="stroke:#000000;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline108" /> + <!-- Arrowhead on XXXpoint 9300 3150 - 10860 3150--> + <!-- Line: box --> + <rect + x="10800" + y="2850" + width="1200" + height="750" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect112" /> + <!-- Line --> + <polyline + points="11400,3600 11400,4284 " + style="stroke:#000000;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline114" /> + <!-- Arrowhead on XXXpoint 11400 3600 - 11400 4410--> + <!-- Line: box --> + <rect + x="10800" + y="4350" + width="1200" + height="750" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect118" /> + <!-- Line --> + <polyline + points="11400,5100 11400,5784 " + style="stroke:#000000;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline120" /> + <!-- Arrowhead on XXXpoint 11400 5100 - 11400 5910--> + <!-- Line: box --> + <rect + x="10800" + 
y="5850" + width="1200" + height="750" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect124" /> + <!-- Line --> + <polyline + points="9300,3900 9900,3900 9900,4650 10734,4650 " + style="stroke:#000000;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline126" /> + <!-- Arrowhead on XXXpoint 9900 4650 - 10860 4650--> + <!-- Line --> + <polyline + points="9300,4650 9600,4650 9600,6150 10734,6150 " + style="stroke:#000000;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline130" /> + <!-- Arrowhead on XXXpoint 9600 6150 - 10860 6150--> + <!-- Text --> + <text + xml:space="preserve" + x="6450" + y="300" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="192" + text-anchor="end" + id="text134">rcu_bh</text> + <!-- Text --> + <text + xml:space="preserve" + x="3150" + y="1950" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text136">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="3150" + y="2250" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text138">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="1650" + y="3750" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text140">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="1650" + y="4050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text142">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="2250" + y="5700" + fill="#000000" + 
font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text144">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="2250" + y="6000" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text146">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="1050" + y="6900" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text148">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="1050" + y="7200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text150">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="5250" + y="5700" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text152">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="5250" + y="6000" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text154">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="4050" + y="6900" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text156">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="4050" + y="7200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text158">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="450" + y="1350" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="start" + id="text160">struct rcu_state</text> + <!-- Text --> + <text + xml:space="preserve" + x="1050" + 
y="9600" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text162">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="1050" + y="9900" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text164">rcu_dynticks</text> + <!-- Text --> + <text + xml:space="preserve" + x="4050" + y="9600" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text166">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="4050" + y="9900" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text168">rcu_dynticks</text> + <!-- Text --> + <text + xml:space="preserve" + x="2400" + y="8400" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text170">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="2400" + y="8700" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text172">rcu_dynticks</text> + <!-- Text --> + <text + xml:space="preserve" + x="5400" + y="8400" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text174">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="5400" + y="8700" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text176">rcu_dynticks</text> + <!-- Text --> + <text + xml:space="preserve" + x="6000" + y="750" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="192" + text-anchor="end" + id="text178">rcu_sched</text> + <!-- Text --> + <text + 
xml:space="preserve" + x="11400" + y="3300" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="216" + text-anchor="middle" + id="text180">T3</text> + <!-- Text --> + <text + xml:space="preserve" + x="11400" + y="4800" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="216" + text-anchor="middle" + id="text182">T2</text> + <!-- Text --> + <text + xml:space="preserve" + x="11400" + y="6300" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="216" + text-anchor="middle" + id="text184">T1</text> + <!-- Line --> + <polyline + points="5250,5400 5250,4414 " + style="stroke:#00d1d1;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline186" /> + <!-- Arrowhead on XXXpoint 5250 5400 - 5250 4290--> + <!-- Line: box --> + <rect + x="3750" + y="3450" + width="1800" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect190" /> + <!-- Line: box --> + <rect + x="7350" + y="2850" + width="1950" + height="750" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect192" /> + <!-- Line: box --> + <rect + x="7350" + y="3600" + width="1950" + height="750" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect194" /> + <!-- Line: box --> + <rect + x="7350" + y="4350" + width="1950" + height="750" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect196" /> + <!-- Text --> + <text + xml:space="preserve" + x="4650" + y="4050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + 
id="text198">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="4650" + y="3750" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text200">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="7500" + y="3300" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="start" + id="text202">blkd_tasks</text> + <!-- Text --> + <text + xml:space="preserve" + x="7500" + y="4050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="start" + id="text204">gp_tasks</text> + <!-- Text --> + <text + xml:space="preserve" + x="7500" + y="4800" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="start" + id="text206">exp_tasks</text> + </g> +</svg> diff --git a/Documentation/RCU/Design/Data-Structures/nxtlist.svg b/Documentation/RCU/Design/Data-Structures/nxtlist.svg new file mode 100644 index 000000000000..abc4cc73a097 --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/nxtlist.svg @@ -0,0 +1,396 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- Creator: fig2dev Version 3.2 Patchlevel 5e --> + +<!-- CreationDate: Wed Dec 9 17:39:46 2015 --> + +<!-- Magnification: 3.000 --> + +<svg + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns="http://www.w3.org/2000/svg" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + width="10.4in" + height="10.4in" + viewBox="-66 -66 12507 12507" + id="svg2" + version="1.1" + inkscape:version="0.48.4 r9939" + sodipodi:docname="nxtlist.fig"> + <metadata + id="metadata94"> + <rdf:RDF> + <cc:Work + rdf:about=""> 
+ <dc:format>image/svg+xml</dc:format> + <dc:type + rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> + <dc:title></dc:title> + </cc:Work> + </rdf:RDF> + </metadata> + <defs + id="defs92"> + <marker + inkscape:stockid="Arrow1Mend" + orient="auto" + refY="0.0" + refX="0.0" + id="Arrow1Mend" + style="overflow:visible;"> + <path + id="path3852" + d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z " + style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;" + transform="scale(0.4) rotate(180) translate(10,0)" /> + </marker> + </defs> + <sodipodi:namedview + pagecolor="#ffffff" + bordercolor="#666666" + borderopacity="1" + objecttolerance="10" + gridtolerance="10" + guidetolerance="10" + inkscape:pageopacity="0" + inkscape:pageshadow="2" + inkscape:window-width="925" + inkscape:window-height="928" + id="namedview90" + showgrid="false" + inkscape:zoom="0.80021373" + inkscape:cx="467.99997" + inkscape:cy="467.99997" + inkscape:window-x="948" + inkscape:window-y="73" + inkscape:window-maximized="0" + inkscape:current-layer="g4" /> + <g + style="stroke-width:.025in; fill:none" + id="g4"> + <!-- Line: box --> + <rect + x="0" + y="0" + width="7875" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect6" /> + <!-- Line: box --> + <rect + x="0" + y="1125" + width="7875" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect8" /> + <!-- Line: box --> + <rect + x="0" + y="2250" + width="7875" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect10" /> + <!-- Line: box --> + <rect + x="0" + y="3375" + width="7875" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect12" /> + <!-- Line: box --> + <rect + x="0" + y="4500" + width="7875" + 
height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect14" /> + <!-- Line: box --> + <rect + x="10575" + y="0" + width="1800" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect16" /> + <!-- Line: box --> + <rect + x="10575" + y="1125" + width="1800" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect18" /> + <!-- Line --> + <polyline + points="11475,2250 11475,3276 " + style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline20" /> + <!-- Arrowhead on XXXpoint 11475 2250 - 11475 3465--> + <!-- Line: box --> + <rect + x="10575" + y="6750" + width="1800" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect24" /> + <!-- Line: box --> + <rect + x="10575" + y="7875" + width="1800" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect26" /> + <!-- Line: box --> + <rect + x="10575" + y="10125" + width="1800" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect28" /> + <!-- Line: box --> + <rect + x="10575" + y="11250" + width="1800" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect30" /> + <!-- Line: box --> + <rect + x="10575" + y="3375" + width="1800" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect32" /> + <!-- Line --> + <polyline + points="11475,5625 11475,6651 " + 
style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline34" /> + <!-- Arrowhead on XXXpoint 11475 5625 - 11475 6840--> + <!-- Line --> + <polyline + points="7875,225 10476,225 " + style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline38" /> + <!-- Arrowhead on XXXpoint 7875 225 - 10665 225--> + <!-- Line --> + <polyline + points="7875,1350 9675,1350 9675,675 7971,675 " + style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline42" /> + <!-- Arrowhead on XXXpoint 9675 675 - 7785 675--> + <!-- Line --> + <polyline + points="7875,2475 9675,2475 9675,4725 10476,4725 " + style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline46" /> + <!-- Arrowhead on XXXpoint 9675 4725 - 10665 4725--> + <!-- Line --> + <polyline + points="7875,3600 9225,3600 9225,5175 10476,5175 " + style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline50" /> + <!-- Arrowhead on XXXpoint 9225 5175 - 10665 5175--> + <!-- Line --> + <polyline + points="7875,4725 8775,4725 8775,11475 10476,11475 " + style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline54" /> + <!-- Arrowhead on XXXpoint 8775 11475 - 10665 11475--> + <!-- Line: box --> + <rect + x="10575" + y="4500" + width="1800" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect58" 
/> + <!-- Line --> + <polyline + points="11475,9000 11475,10026 " + style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline60" /> + <!-- Arrowhead on XXXpoint 11475 9000 - 11475 10215--> + <!-- Text --> + <text + xml:space="preserve" + x="225" + y="675" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="324" + text-anchor="start" + id="text64">nxtlist</text> + <!-- Text --> + <text + xml:space="preserve" + x="225" + y="1800" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="324" + text-anchor="start" + id="text66">nxttail[RCU_DONE_TAIL]</text> + <!-- Text --> + <text + xml:space="preserve" + x="225" + y="2925" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="324" + text-anchor="start" + id="text68">nxttail[RCU_WAIT_TAIL]</text> + <!-- Text --> + <text + xml:space="preserve" + x="225" + y="4050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="324" + text-anchor="start" + id="text70">nxttail[RCU_NEXT_READY_TAIL]</text> + <!-- Text --> + <text + xml:space="preserve" + x="225" + y="5175" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="324" + text-anchor="start" + id="text72">nxttail[RCU_NEXT_TAIL]</text> + <!-- Text --> + <text + xml:space="preserve" + x="11475" + y="675" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text74">CB 1</text> + <!-- Text --> + <text + xml:space="preserve" + x="11475" + y="1800" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text76">next</text> + <!-- Text --> + <text + xml:space="preserve" + x="11475" + y="7425" + 
fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text78">CB 3</text> + <!-- Text --> + <text + xml:space="preserve" + x="11475" + y="8550" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text80">next</text> + <!-- Text --> + <text + xml:space="preserve" + x="11475" + y="10800" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text82">CB 4</text> + <!-- Text --> + <text + xml:space="preserve" + x="11475" + y="11925" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text84">next</text> + <!-- Text --> + <text + xml:space="preserve" + x="11475" + y="4050" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text86">CB 2</text> + <!-- Text --> + <text + xml:space="preserve" + x="11475" + y="5175" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text88">next</text> + </g> +</svg> diff --git a/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png b/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png Binary files differdeleted file mode 100644 index 7496a55e4e7b..000000000000 --- a/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png +++ /dev/null diff --git a/Documentation/RCU/Design/Requirements/RCUApplicability.svg b/Documentation/RCU/Design/Requirements/RCUApplicability.svg deleted file mode 100644 index ebcbeee391ed..000000000000 --- a/Documentation/RCU/Design/Requirements/RCUApplicability.svg +++ /dev/null @@ -1,237 +0,0 @@ -<?xml version="1.0" encoding="UTF-8" standalone="no"?> -<!-- Creator: fig2dev Version 3.2 Patchlevel 5d --> - -<!-- 
CreationDate: Tue Mar 4 18:34:25 2014 --> - -<!-- Magnification: 3.000 --> - -<svg - xmlns:dc="http://purl.org/dc/elements/1.1/" - xmlns:cc="http://creativecommons.org/ns#" - xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" - xmlns:svg="http://www.w3.org/2000/svg" - xmlns="http://www.w3.org/2000/svg" - xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" - xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" - width="1089.1382" - height="668.21368" - viewBox="-2121 -36 14554.634 8876.4061" - id="svg2" - version="1.1" - inkscape:version="0.48.3.1 r9886" - sodipodi:docname="RCUApplicability.svg"> - <metadata - id="metadata40"> - <rdf:RDF> - <cc:Work - rdf:about=""> - <dc:format>image/svg+xml</dc:format> - <dc:type - rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> - <dc:title /> - </cc:Work> - </rdf:RDF> - </metadata> - <defs - id="defs38" /> - <sodipodi:namedview - pagecolor="#ffffff" - bordercolor="#666666" - borderopacity="1" - objecttolerance="10" - gridtolerance="10" - guidetolerance="10" - inkscape:pageopacity="0" - inkscape:pageshadow="2" - inkscape:window-width="849" - inkscape:window-height="639" - id="namedview36" - showgrid="false" - inkscape:zoom="0.51326165" - inkscape:cx="544.56912" - inkscape:cy="334.10686" - inkscape:window-x="149" - inkscape:window-y="448" - inkscape:window-maximized="0" - inkscape:current-layer="g4" - fit-margin-top="5" - fit-margin-left="5" - fit-margin-right="5" - fit-margin-bottom="5" /> - <g - style="fill:none;stroke-width:0.025in" - id="g4" - transform="translate(-2043.6828,14.791398)"> - <!-- Line: box --> - <rect - x="0" - y="0" - width="14400" - height="8775" - rx="0" - style="fill:#ffa1a1;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter" - id="rect6" /> - <!-- Line: box --> - <rect - x="1350" - y="0" - width="11700" - height="6075" - rx="0" - style="fill:#ffff00;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter" - id="rect8" /> - <!-- Line: 
box --> - <rect - x="2700" - y="0" - width="9000" - height="4275" - rx="0" - style="fill:#00ff00;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter" - id="rect10" /> - <!-- Line: box --> - <rect - x="4050" - y="0" - width="6300" - height="2475" - rx="0" - style="fill:#87cfff;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter" - id="rect12" /> - <!-- Text --> - <text - xml:space="preserve" - x="7200" - y="900" - font-style="normal" - font-weight="normal" - font-size="324" - id="text14" - sodipodi:linespacing="125%" - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" - id="tspan3017">Read-Mostly, Stale &</tspan></text> - <!-- Text --> - <text - xml:space="preserve" - x="7200" - y="1350" - font-style="normal" - font-weight="normal" - font-size="324" - id="text16" - sodipodi:linespacing="125%" - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" - id="tspan3019">Inconsistent Data OK</tspan></text> - <!-- Text --> - <text - xml:space="preserve" - x="7200" - y="1800" - font-style="normal" - 
font-weight="normal" - font-size="324" - id="text18" - sodipodi:linespacing="125%" - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" - id="tspan3021">(RCU Works Great!!!)</tspan></text> - <!-- Text --> - <text - xml:space="preserve" - x="7200" - y="3825" - font-style="normal" - font-weight="normal" - font-size="324" - id="text20" - sodipodi:linespacing="125%" - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" - id="tspan3023">(RCU Works Well)</tspan></text> - <!-- Text --> - <text - xml:space="preserve" - x="7200" - y="3375" - font-style="normal" - font-weight="normal" - font-size="324" - id="text22" - sodipodi:linespacing="125%" - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan - 
style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" - id="tspan3025">Read-Mostly, Need Consistent Data</tspan></text> - <!-- Text --> - <text - xml:space="preserve" - x="7200" - y="5175" - font-style="normal" - font-weight="normal" - font-size="324" - id="text24" - sodipodi:linespacing="125%" - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" - id="tspan3027">Read-Write, Need Consistent Data</tspan></text> - <!-- Text --> - <text - xml:space="preserve" - x="7200" - y="6975" - font-style="normal" - font-weight="normal" - font-size="324" - id="text26" - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" - sodipodi:linespacing="125%">Update-Mostly, Need Consistent Data</text> - <!-- Text --> - <text - xml:space="preserve" - x="7200" - y="5625" - font-style="normal" - font-weight="normal" - font-size="324" - id="text28" - sodipodi:linespacing="125%" - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans 
L;-inkscape-font-specification:Nimbus Sans L"><tspan - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" - id="tspan3029">(RCU Might Be OK...)</tspan></text> - <!-- Text --> - <text - xml:space="preserve" - x="7200" - y="7875" - font-style="normal" - font-weight="normal" - font-size="324" - id="text30" - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" - sodipodi:linespacing="125%">(1) Provide Existence Guarantees For Update-Friendly Mechanisms</text> - <!-- Text --> - <text - xml:space="preserve" - x="7200" - y="8325" - font-style="normal" - font-weight="normal" - font-size="324" - id="text32" - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" - sodipodi:linespacing="125%">(2) Provide Wait-Free Read-Side Primitives for Real-Time Use)</text> - <!-- Text --> - <text - xml:space="preserve" - x="7200" - y="7425" - font-style="normal" - font-weight="normal" - font-size="324" - id="text34" - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" - sodipodi:linespacing="125%">(RCU is Very Unlikely to be the Right Tool For The Job, But it Can:</text> - </g> -</svg> diff --git a/Documentation/RCU/Design/Requirements/Requirements.html 
b/Documentation/RCU/Design/Requirements/Requirements.html index a725f9900ec8..e7e24b3e86e2 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -1,5 +1,3 @@ -<!-- DO NOT HAND EDIT. --> -<!-- Instead, edit Documentation/RCU/Design/Requirements/Requirements.htmlx and run 'sh htmlqqz.sh Documentation/RCU/Design/Requirements/Requirements' --> <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> <html> @@ -65,8 +63,8 @@ All that aside, here are the categories of currently known RCU requirements: <p> This is followed by a <a href="#Summary">summary</a>, -which is in turn followed by the inevitable -<a href="#Answers to Quick Quizzes">answers to the quick quizzes</a>. +however, the answers to each quick quiz immediately follows the quiz. +Select the big white space with your mouse to see the answer. <h2><a name="Fundamental Requirements">Fundamental Requirements</a></h2> @@ -153,13 +151,27 @@ Therefore, the outcome: </blockquote> cannot happen. -<p><a name="Quick Quiz 1"><b>Quick Quiz 1</b>:</a> -Wait a minute! -You said that updaters can make useful forward progress concurrently -with readers, but pre-existing readers will block -<tt>synchronize_rcu()</tt>!!! -Just who are you trying to fool??? -<br><a href="#qq1answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + Wait a minute! + You said that updaters can make useful forward progress concurrently + with readers, but pre-existing readers will block + <tt>synchronize_rcu()</tt>!!! + Just who are you trying to fool??? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + First, if updaters do not wish to be blocked by readers, they can use + <tt>call_rcu()</tt> or <tt>kfree_rcu()</tt>, which will + be discussed later. 
+ Second, even when using <tt>synchronize_rcu()</tt>, the other + update-side code does run concurrently with readers, whether + pre-existing or not. +</font></td></tr> +<tr><td> </td></tr> +</table> <p> This scenario resembles one of the first uses of RCU in @@ -210,9 +222,20 @@ to guarantee that <tt>do_something()</tt> never runs concurrently with <tt>recovery()</tt>, but with little or no synchronization overhead in <tt>do_something_dlm()</tt>. -<p><a name="Quick Quiz 2"><b>Quick Quiz 2</b>:</a> -Why is the <tt>synchronize_rcu()</tt> on line 28 needed? -<br><a href="#qq2answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + Why is the <tt>synchronize_rcu()</tt> on line 28 needed? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + Without that extra grace period, memory reordering could result in + <tt>do_something_dlm()</tt> executing <tt>do_something()</tt> + concurrently with the last bits of <tt>recovery()</tt>. +</font></td></tr> +<tr><td> </td></tr> +</table> <p> In order to avoid fatal problems such as deadlocks, @@ -332,12 +355,27 @@ It also prevents any number of “interesting” compiler optimizations, for example, the use of <tt>gp</tt> as a scratch location immediately preceding the assignment. -<p><a name="Quick Quiz 3"><b>Quick Quiz 3</b>:</a> -But <tt>rcu_assign_pointer()</tt> does nothing to prevent the -two assignments to <tt>p->a</tt> and <tt>p->b</tt> -from being reordered. -Can't that also cause problems? -<br><a href="#qq3answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + But <tt>rcu_assign_pointer()</tt> does nothing to prevent the + two assignments to <tt>p->a</tt> and <tt>p->b</tt> + from being reordered. + Can't that also cause problems? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + No, it cannot. 
+ The readers cannot see either of these two fields until + the assignment to <tt>gp</tt>, by which time both fields are + fully initialized. + So reordering the assignments + to <tt>p->a</tt> and <tt>p->b</tt> cannot possibly + cause any problems. +</font></td></tr> +<tr><td> </td></tr> +</table> <p> It is tempting to assume that the reader need not do anything special @@ -494,11 +532,42 @@ The <tt>rcu_access_pointer()</tt> on line 6 is similar to code protected by the corresponding update-side lock. </ol> -<p><a name="Quick Quiz 4"><b>Quick Quiz 4</b>:</a> -Without the <tt>rcu_dereference()</tt> or the -<tt>rcu_access_pointer()</tt>, what destructive optimizations -might the compiler make use of? -<br><a href="#qq4answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + Without the <tt>rcu_dereference()</tt> or the + <tt>rcu_access_pointer()</tt>, what destructive optimizations + might the compiler make use of? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + Let's start with what happens to <tt>do_something_gp()</tt> + if it fails to use <tt>rcu_dereference()</tt>. + It could reuse a value formerly fetched from this same pointer. + It could also fetch the pointer from <tt>gp</tt> in a byte-at-a-time + manner, resulting in <i>load tearing</i>, in turn resulting in a bytewise + mash-up of two distinct pointer values. + It might even use value-speculation optimizations, where it makes + a wrong guess, but by the time it gets around to checking the + value, an update has changed the pointer to match the wrong guess. + Too bad about any dereferences that returned pre-initialization garbage + in the meantime! + </font> + + <p><font color="ffffff"> + For <tt>remove_gp_synchronous()</tt>, as long as all modifications + to <tt>gp</tt> are carried out while holding <tt>gp_lock</tt>, + the above optimizations are harmless. 
+ However, + with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt>, + <tt>sparse</tt> will complain if you + define <tt>gp</tt> with <tt>__rcu</tt> and then + access it without using + either <tt>rcu_access_pointer()</tt> or <tt>rcu_dereference()</tt>. +</font></td></tr> +<tr><td> </td></tr> +</table> <p> In short, RCU's publish-subscribe guarantee is provided by the combination @@ -571,17 +640,156 @@ systems with more than one CPU: <tt>synchronize_rcu()</tt> migrates in the meantime. </ol> -<p><a name="Quick Quiz 5"><b>Quick Quiz 5</b>:</a> -Given that multiple CPUs can start RCU read-side critical sections -at any time without any ordering whatsoever, how can RCU possibly tell whether -or not a given RCU read-side critical section starts before a -given instance of <tt>synchronize_rcu()</tt>? -<br><a href="#qq5answer">Answer</a> - -<p><a name="Quick Quiz 6"><b>Quick Quiz 6</b>:</a> -The first and second guarantees require unbelievably strict ordering! -Are all these memory barriers <i> really</i> required? -<br><a href="#qq6answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + Given that multiple CPUs can start RCU read-side critical sections + at any time without any ordering whatsoever, how can RCU possibly + tell whether or not a given RCU read-side critical section starts + before a given instance of <tt>synchronize_rcu()</tt>? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + If RCU cannot tell whether or not a given + RCU read-side critical section starts before a + given instance of <tt>synchronize_rcu()</tt>, + then it must assume that the RCU read-side critical section + started first. + In other words, a given instance of <tt>synchronize_rcu()</tt> + can avoid waiting on a given RCU read-side critical section only + if it can prove that <tt>synchronize_rcu()</tt> started first. 
+</font></td></tr> +<tr><td> </td></tr> +</table> + +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + The first and second guarantees require unbelievably strict ordering! + Are all these memory barriers <i> really</i> required? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + Yes, they really are required. + To see why the first guarantee is required, consider the following + sequence of events: + </font> + + <ol> + <li> <font color="ffffff"> + CPU 1: <tt>rcu_read_lock()</tt> + </font> + <li> <font color="ffffff"> + CPU 1: <tt>q = rcu_dereference(gp); + /* Very likely to return p. */</tt> + </font> + <li> <font color="ffffff"> + CPU 0: <tt>list_del_rcu(p);</tt> + </font> + <li> <font color="ffffff"> + CPU 0: <tt>synchronize_rcu()</tt> starts. + </font> + <li> <font color="ffffff"> + CPU 1: <tt>do_something_with(q->a); + /* No smp_mb(), so might happen after kfree(). */</tt> + </font> + <li> <font color="ffffff"> + CPU 1: <tt>rcu_read_unlock()</tt> + </font> + <li> <font color="ffffff"> + CPU 0: <tt>synchronize_rcu()</tt> returns. + </font> + <li> <font color="ffffff"> + CPU 0: <tt>kfree(p);</tt> + </font> + </ol> + + <p><font color="ffffff"> + Therefore, there absolutely must be a full memory barrier between the + end of the RCU read-side critical section and the end of the + grace period. + </font> + + <p><font color="ffffff"> + The sequence of events demonstrating the necessity of the second rule + is roughly similar: + </font> + + <ol> + <li> <font color="ffffff">CPU 0: <tt>list_del_rcu(p);</tt> + </font> + <li> <font color="ffffff">CPU 0: <tt>synchronize_rcu()</tt> starts. + </font> + <li> <font color="ffffff">CPU 1: <tt>rcu_read_lock()</tt> + </font> + <li> <font color="ffffff">CPU 1: <tt>q = rcu_dereference(gp); + /* Might return p if no memory barrier. */</tt> + </font> + <li> <font color="ffffff">CPU 0: <tt>synchronize_rcu()</tt> returns. 
+ </font> + <li> <font color="ffffff">CPU 0: <tt>kfree(p);</tt> + </font> + <li> <font color="ffffff"> + CPU 1: <tt>do_something_with(q->a); /* Boom!!! */</tt> + </font> + <li> <font color="ffffff">CPU 1: <tt>rcu_read_unlock()</tt> + </font> + </ol> + + <p><font color="ffffff"> + And similarly, without a memory barrier between the beginning of the + grace period and the beginning of the RCU read-side critical section, + CPU 1 might end up accessing the freelist. + </font> + + <p><font color="ffffff"> + The “as if” rule of course applies, so that any + implementation that acts as if the appropriate memory barriers + were in place is a correct implementation. + That said, it is much easier to fool yourself into believing + that you have adhered to the as-if rule than it is to actually + adhere to it! +</font></td></tr> +<tr><td> </td></tr> +</table> + +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + You claim that <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt> + generate absolutely no code in some kernel builds. + This means that the compiler might arbitrarily rearrange consecutive + RCU read-side critical sections. + Given such rearrangement, if a given RCU read-side critical section + is done, how can you be sure that all prior RCU read-side critical + sections are done? + Won't the compiler rearrangements make that impossible to determine? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + In cases where <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt> + generate absolutely no code, RCU infers quiescent states only at + special locations, for example, within the scheduler. 
+ Because calls to <tt>schedule()</tt> had better prevent calling-code + accesses to shared variables from being rearranged across the call to + <tt>schedule()</tt>, if RCU detects the end of a given RCU read-side + critical section, it will necessarily detect the end of all prior + RCU read-side critical sections, no matter how aggressively the + compiler scrambles the code. + </font> + + <p><font color="ffffff"> + Again, this all assumes that the compiler cannot scramble code across + calls to the scheduler, out of interrupt handlers, into the idle loop, + into user-mode code, and so on. + But if your kernel build allows that sort of scrambling, you have broken + far more than just RCU! +</font></td></tr> +<tr><td> </td></tr> +</table> <p> Note that these memory-barrier requirements do not replace the fundamental @@ -626,9 +834,19 @@ inconvenience can be avoided through use of the <tt>call_rcu()</tt> and <tt>kfree_rcu()</tt> API members described later in this document. -<p><a name="Quick Quiz 7"><b>Quick Quiz 7</b>:</a> -But how does the upgrade-to-write operation exclude other readers? -<br><a href="#qq7answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + But how does the upgrade-to-write operation exclude other readers? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + It doesn't, just like normal RCU updates, which also do not exclude + RCU readers. +</font></td></tr> +<tr><td> </td></tr> +</table> <p> This guarantee allows lookup code to be shared between read-side @@ -714,9 +932,20 @@ to do significant reordering. This is by design: Any significant ordering constraints would slow down these fast-path APIs. -<p><a name="Quick Quiz 8"><b>Quick Quiz 8</b>:</a> -Can't the compiler also reorder this code? 
-<br><a href="#qq8answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + Can't the compiler also reorder this code? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + No, the volatile casts in <tt>READ_ONCE()</tt> and + <tt>WRITE_ONCE()</tt> prevent the compiler from reordering in + this particular case. +</font></td></tr> +<tr><td> </td></tr> +</table> <h3><a name="Readers Do Not Exclude Updaters">Readers Do Not Exclude Updaters</a></h3> @@ -769,10 +998,28 @@ new readers can start immediately after <tt>synchronize_rcu()</tt> starts, and <tt>synchronize_rcu()</tt> is under no obligation to wait for these new readers. -<p><a name="Quick Quiz 9"><b>Quick Quiz 9</b>:</a> -Suppose that synchronize_rcu() did wait until all readers had completed. -Would the updater be able to rely on this? -<br><a href="#qq9answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + Suppose that synchronize_rcu() did wait until <i>all</i> + readers had completed instead of waiting only on + pre-existing readers. + For how long would the updater be able to rely on there + being no readers? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + For no time at all. + Even if <tt>synchronize_rcu()</tt> were to wait until + all readers had completed, a new reader might start immediately after + <tt>synchronize_rcu()</tt> completed. + Therefore, the code following + <tt>synchronize_rcu()</tt> can <i>never</i> rely on there being + no readers. +</font></td></tr> +<tr><td> </td></tr> +</table> <h3><a name="Grace Periods Don't Partition Read-Side Critical Sections"> Grace Periods Don't Partition Read-Side Critical Sections</a></h3> @@ -969,11 +1216,24 @@ grace period. As a result, an RCU read-side critical section cannot partition a pair of RCU grace periods. 
-<p><a name="Quick Quiz 10"><b>Quick Quiz 10</b>:</a> -How long a sequence of grace periods, each separated by an RCU read-side -critical section, would be required to partition the RCU read-side -critical sections at the beginning and end of the chain? -<br><a href="#qq10answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + How long a sequence of grace periods, each separated by an RCU + read-side critical section, would be required to partition the RCU + read-side critical sections at the beginning and end of the chain? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + In theory, an infinite number. + In practice, an unknown number that is sensitive to both implementation + details and timing considerations. + Therefore, even in practice, RCU users must abide by the + theoretical rather than the practical answer. +</font></td></tr> +<tr><td> </td></tr> +</table> <h3><a name="Disabling Preemption Does Not Block Grace Periods"> Disabling Preemption Does Not Block Grace Periods</a></h3> @@ -1109,12 +1369,27 @@ These classes is covered in the following sections. <h3><a name="Specialization">Specialization</a></h3> <p> -RCU is and always has been intended primarily for read-mostly situations, as -illustrated by the following figure. -This means that RCU's read-side primitives are optimized, often at the +RCU is and always has been intended primarily for read-mostly situations, +which means that RCU's read-side primitives are optimized, often at the expense of its update-side primitives. +Experience thus far is captured by the following list of situations: -<p><img src="RCUApplicability.svg" alt="RCUApplicability.svg" width="70%"></p> +<ol> +<li> Read-mostly data, where stale and inconsistent data is not + a problem: RCU works great! +<li> Read-mostly data, where data must be consistent: + RCU works well. 
+<li> Read-write data, where data must be consistent: + RCU <i>might</i> work OK. + Or not. +<li> Write-mostly data, where data must be consistent: + RCU is very unlikely to be the right tool for the job, + with the following exceptions, where RCU can provide: + <ol type=a> + <li> Existence guarantees for update-friendly mechanisms. + <li> Wait-free read-side primitives for real-time use. + </ol> +</ol> <p> This focus on read-mostly situations means that RCU must interoperate @@ -1127,9 +1402,43 @@ synchronization primitives be legal within RCU read-side critical sections, including spinlocks, sequence locks, atomic operations, reference counters, and memory barriers. -<p><a name="Quick Quiz 11"><b>Quick Quiz 11</b>:</a> -What about sleeping locks? -<br><a href="#qq11answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + What about sleeping locks? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + These are forbidden within Linux-kernel RCU read-side critical + sections because it is not legal to place a quiescent state + (in this case, voluntary context switch) within an RCU read-side + critical section. + However, sleeping locks may be used within userspace RCU read-side + critical sections, and also within Linux-kernel sleepable RCU + <a href="#Sleepable RCU"><font color="ffffff">(SRCU)</font></a> + read-side critical sections. + In addition, the -rt patchset turns spinlocks into + sleeping locks so that the corresponding critical sections + can be preempted, which also means that these sleeplockified + spinlocks (but not other sleeping locks!) may be acquired within + -rt-Linux-kernel RCU read-side critical sections. 
+ </font> + + <p><font color="ffffff"> + Note that it <i>is</i> legal for a normal RCU read-side + critical section to conditionally acquire a sleeping lock + (as in <tt>mutex_trylock()</tt>), but only as long as it does + not loop indefinitely attempting to conditionally acquire that + sleeping lock. + The key point is that things like <tt>mutex_trylock()</tt> + either return with the mutex held, or return an error indication if + the mutex was not immediately available. + Either way, <tt>mutex_trylock()</tt> returns immediately without + sleeping. +</font></td></tr> +<tr><td> </td></tr> +</table> <p> It often comes as a surprise that many algorithms do not require a @@ -1160,10 +1469,7 @@ some period of time, so the exact wait period is a judgment call. One of our pair of veternarians might wait 30 seconds before pronouncing the cat dead, while the other might insist on waiting a full minute. The two veternarians would then disagree on the state of the cat during -the final 30 seconds of the minute following the last heartbeat, as -fancifully illustrated below: - -<p><img src="2013-08-is-it-dead.png" alt="2013-08-is-it-dead.png" width="431"></p> +the final 30 seconds of the minute following the last heartbeat. <p> Interestingly enough, this same situation applies to hardware. @@ -1343,7 +1649,8 @@ situations where neither <tt>synchronize_rcu()</tt> nor <tt>synchronize_rcu_expedited()</tt> would be legal, including within preempt-disable code, <tt>local_bh_disable()</tt> code, interrupt-disable code, and interrupt handlers. -However, even <tt>call_rcu()</tt> is illegal within NMI handlers. +However, even <tt>call_rcu()</tt> is illegal within NMI handlers +and from idle and offline CPUs. The callback function (<tt>remove_gp_cb()</tt> in this case) will be executed within softirq (software interrupt) environment within the Linux kernel, @@ -1354,12 +1661,27 @@ write an RCU callback function that takes too long. 
Long-running operations should be relegated to separate threads or (in the Linux kernel) workqueues. -<p><a name="Quick Quiz 12"><b>Quick Quiz 12</b>:</a> -Why does line 19 use <tt>rcu_access_pointer()</tt>? -After all, <tt>call_rcu()</tt> on line 25 stores into the -structure, which would interact badly with concurrent insertions. -Doesn't this mean that <tt>rcu_dereference()</tt> is required? -<br><a href="#qq12answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + Why does line 19 use <tt>rcu_access_pointer()</tt>? + After all, <tt>call_rcu()</tt> on line 25 stores into the + structure, which would interact badly with concurrent insertions. + Doesn't this mean that <tt>rcu_dereference()</tt> is required? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + Presumably the <tt>->gp_lock</tt> acquired on line 18 excludes + any changes, including any insertions that <tt>rcu_dereference()</tt> + would protect against. + Therefore, any insertions will be delayed until after + <tt>->gp_lock</tt> + is released on line 25, which in turn means that + <tt>rcu_access_pointer()</tt> suffices. +</font></td></tr> +<tr><td> </td></tr> +</table> <p> However, all that <tt>remove_gp_cb()</tt> is doing is @@ -1406,14 +1728,31 @@ This was due to the fact that RCU was not heavily used within DYNIX/ptx, so the very few places that needed something like <tt>synchronize_rcu()</tt> simply open-coded it. -<p><a name="Quick Quiz 13"><b>Quick Quiz 13</b>:</a> -Earlier it was claimed that <tt>call_rcu()</tt> and -<tt>kfree_rcu()</tt> allowed updaters to avoid being blocked -by readers. -But how can that be correct, given that the invocation of the callback -and the freeing of the memory (respectively) must still wait for -a grace period to elapse? 
-<br><a href="#qq13answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + Earlier it was claimed that <tt>call_rcu()</tt> and + <tt>kfree_rcu()</tt> allowed updaters to avoid being blocked + by readers. + But how can that be correct, given that the invocation of the callback + and the freeing of the memory (respectively) must still wait for + a grace period to elapse? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + We could define things this way, but keep in mind that this sort of + definition would say that updates in garbage-collected languages + cannot complete until the next time the garbage collector runs, + which does not seem at all reasonable. + The key point is that in most cases, an updater using either + <tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> can proceed to the + next update as soon as it has invoked <tt>call_rcu()</tt> or + <tt>kfree_rcu()</tt>, without having to wait for a subsequent + grace period. +</font></td></tr> +<tr><td> </td></tr> +</table> <p> But what if the updater must wait for the completion of code to be @@ -1838,11 +2177,26 @@ kthreads to be spawned. Therefore, invoking <tt>synchronize_rcu()</tt> during scheduler initialization can result in deadlock. -<p><a name="Quick Quiz 14"><b>Quick Quiz 14</b>:</a> -So what happens with <tt>synchronize_rcu()</tt> during -scheduler initialization for <tt>CONFIG_PREEMPT=n</tt> -kernels? -<br><a href="#qq14answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + So what happens with <tt>synchronize_rcu()</tt> during + scheduler initialization for <tt>CONFIG_PREEMPT=n</tt> + kernels? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + In <tt>CONFIG_PREEMPT=n</tt> kernel, <tt>synchronize_rcu()</tt> + maps directly to <tt>synchronize_sched()</tt>. 
+ Therefore, <tt>synchronize_rcu()</tt> works normally throughout + boot in <tt>CONFIG_PREEMPT=n</tt> kernels. + However, your code must also work in <tt>CONFIG_PREEMPT=y</tt> kernels, + so it is still necessary to avoid invoking <tt>synchronize_rcu()</tt> + during scheduler initialization. +</font></td></tr> +<tr><td> </td></tr> +</table> <p> I learned of these boot-time requirements as a result of a series of @@ -2171,6 +2525,14 @@ This real-time requirement motivated the grace-period kthread, which also simplified handling of a number of race conditions. <p> +RCU must avoid degrading real-time response for CPU-bound threads, whether +executing in usermode (which is one use case for +<tt>CONFIG_NO_HZ_FULL=y</tt>) or in the kernel. +That said, CPU-bound loops in the kernel must execute +<tt>cond_resched_rcu_qs()</tt> at least once per few tens of milliseconds +in order to avoid receiving an IPI from RCU. + +<p> Finally, RCU's status as a synchronization primitive means that any RCU failure can result in arbitrary memory corruption that can be extremely difficult to debug. @@ -2223,6 +2585,8 @@ described in a separate section. <li> <a href="#Sched Flavor">Sched Flavor</a> <li> <a href="#Sleepable RCU">Sleepable RCU</a> <li> <a href="#Tasks RCU">Tasks RCU</a> +<li> <a href="#Waiting for Multiple Grace Periods"> + Waiting for Multiple Grace Periods</a> </ol> <h3><a name="Bottom-Half Flavor">Bottom-Half Flavor</a></h3> @@ -2472,6 +2836,94 @@ The tasks-RCU API is quite compact, consisting only of <tt>synchronize_rcu_tasks()</tt>, and <tt>rcu_barrier_tasks()</tt>. +<h3><a name="Waiting for Multiple Grace Periods"> +Waiting for Multiple Grace Periods</a></h3> + +<p> +Perhaps you have an RCU-protected data structure that is accessed from +RCU read-side critical sections, from softirq handlers, and from +hardware interrupt handlers. +That is three flavors of RCU: the normal flavor, the bottom-half flavor, +and the sched flavor. +How can you wait for a compound grace period?
+ +<p> +The best approach is usually to “just say no!” and +insert <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt> +around each RCU read-side critical section, regardless of what +environment it happens to be in. +But suppose that some of the RCU read-side critical sections are +on extremely hot code paths, and that use of <tt>CONFIG_PREEMPT=n</tt> +is not a viable option, so that <tt>rcu_read_lock()</tt> and +<tt>rcu_read_unlock()</tt> are not free. +What then? + +<p> +You <i>could</i> wait on all three grace periods in succession, as follows: + +<blockquote> +<pre> + 1 synchronize_rcu(); + 2 synchronize_rcu_bh(); + 3 synchronize_sched(); +</pre> +</blockquote> + +<p> +This works, but triples the update-side latency penalty. +In cases where this is not acceptable, <tt>synchronize_rcu_mult()</tt> +may be used to wait on all three flavors of grace period concurrently: + +<blockquote> +<pre> + 1 synchronize_rcu_mult(call_rcu, call_rcu_bh, call_rcu_sched); +</pre> +</blockquote> + +<p> +But what if it is necessary to also wait on SRCU? +This can be done as follows: + +<blockquote> +<pre> + 1 static void call_my_srcu(struct rcu_head *head, + 2 void (*func)(struct rcu_head *head)) + 3 { + 4 call_srcu(&my_srcu, head, func); + 5 } + 6 + 7 synchronize_rcu_mult(call_rcu, call_rcu_bh, call_rcu_sched, call_my_srcu); +</pre> +</blockquote> + +<p> +If you needed to wait on multiple different flavors of SRCU +(but why???), you would need to create a wrapper function resembling +<tt>call_my_srcu()</tt> for each SRCU flavor. + +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + But what if I need to wait for multiple RCU flavors, but I also need + the grace periods to be expedited? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + If you are using expedited grace periods, there should be less penalty + for waiting on them in succession. 
+ But if that is nevertheless a problem, you can use workqueues + or multiple kthreads to wait on the various expedited grace + periods concurrently. +</font></td></tr> +<tr><td> </td></tr> +</table> + +<p> +Again, it is usually better to adjust the RCU read-side critical sections +to use a single flavor of RCU, but when this is not feasible, you can use +<tt>synchronize_rcu_mult()</tt>. + <h2><a name="Possible Future Changes">Possible Future Changes</a></h2> <p> @@ -2569,329 +3021,4 @@ and is provided under the terms of the Creative Commons Attribution-Share Alike 3.0 United States license. -<h3><a name="Answers to Quick Quizzes"> -Answers to Quick Quizzes</a></h3> - -<a name="qq1answer"></a> -<p><b>Quick Quiz 1</b>: -Wait a minute! -You said that updaters can make useful forward progress concurrently -with readers, but pre-existing readers will block -<tt>synchronize_rcu()</tt>!!! -Just who are you trying to fool??? - - -</p><p><b>Answer</b>: -First, if updaters do not wish to be blocked by readers, they can use -<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt>, which will -be discussed later. -Second, even when using <tt>synchronize_rcu()</tt>, the other -update-side code does run concurrently with readers, whether pre-existing -or not. - - -</p><p><a href="#Quick%20Quiz%201"><b>Back to Quick Quiz 1</b>.</a> - -<a name="qq2answer"></a> -<p><b>Quick Quiz 2</b>: -Why is the <tt>synchronize_rcu()</tt> on line 28 needed? - - -</p><p><b>Answer</b>: -Without that extra grace period, memory reordering could result in -<tt>do_something_dlm()</tt> executing <tt>do_something()</tt> -concurrently with the last bits of <tt>recovery()</tt>. - - -</p><p><a href="#Quick%20Quiz%202"><b>Back to Quick Quiz 2</b>.</a> - -<a name="qq3answer"></a> -<p><b>Quick Quiz 3</b>: -But <tt>rcu_assign_pointer()</tt> does nothing to prevent the -two assignments to <tt>p->a</tt> and <tt>p->b</tt> -from being reordered. -Can't that also cause problems? - - -</p><p><b>Answer</b>: -No, it cannot. 
-The readers cannot see either of these two fields until -the assignment to <tt>gp</tt>, by which time both fields are -fully initialized. -So reordering the assignments -to <tt>p->a</tt> and <tt>p->b</tt> cannot possibly -cause any problems. - - -</p><p><a href="#Quick%20Quiz%203"><b>Back to Quick Quiz 3</b>.</a> - -<a name="qq4answer"></a> -<p><b>Quick Quiz 4</b>: -Without the <tt>rcu_dereference()</tt> or the -<tt>rcu_access_pointer()</tt>, what destructive optimizations -might the compiler make use of? - - -</p><p><b>Answer</b>: -Let's start with what happens to <tt>do_something_gp()</tt> -if it fails to use <tt>rcu_dereference()</tt>. -It could reuse a value formerly fetched from this same pointer. -It could also fetch the pointer from <tt>gp</tt> in a byte-at-a-time -manner, resulting in <i>load tearing</i>, in turn resulting in a bytewise -mash-up of two distinct pointer values. -It might even use value-speculation optimizations, where it makes a wrong -guess, but by the time it gets around to checking the value, an update -has changed the pointer to match the wrong guess. -Too bad about any dereferences that returned pre-initialization garbage -in the meantime! - -<p> -For <tt>remove_gp_synchronous()</tt>, as long as all modifications -to <tt>gp</tt> are carried out while holding <tt>gp_lock</tt>, -the above optimizations are harmless. -However, -with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt>, -<tt>sparse</tt> will complain if you -define <tt>gp</tt> with <tt>__rcu</tt> and then -access it without using -either <tt>rcu_access_pointer()</tt> or <tt>rcu_dereference()</tt>. - - -</p><p><a href="#Quick%20Quiz%204"><b>Back to Quick Quiz 4</b>.</a> - -<a name="qq5answer"></a> -<p><b>Quick Quiz 5</b>: -Given that multiple CPUs can start RCU read-side critical sections -at any time without any ordering whatsoever, how can RCU possibly tell whether -or not a given RCU read-side critical section starts before a -given instance of <tt>synchronize_rcu()</tt>?
- - -</p><p><b>Answer</b>: -If RCU cannot tell whether or not a given -RCU read-side critical section starts before a -given instance of <tt>synchronize_rcu()</tt>, -then it must assume that the RCU read-side critical section -started first. -In other words, a given instance of <tt>synchronize_rcu()</tt> -can avoid waiting on a given RCU read-side critical section only -if it can prove that <tt>synchronize_rcu()</tt> started first. - - -</p><p><a href="#Quick%20Quiz%205"><b>Back to Quick Quiz 5</b>.</a> - -<a name="qq6answer"></a> -<p><b>Quick Quiz 6</b>: -The first and second guarantees require unbelievably strict ordering! -Are all these memory barriers <i> really</i> required? - - -</p><p><b>Answer</b>: -Yes, they really are required. -To see why the first guarantee is required, consider the following -sequence of events: - -<ol> -<li> CPU 1: <tt>rcu_read_lock()</tt> -<li> CPU 1: <tt>q = rcu_dereference(gp); - /* Very likely to return p. */</tt> -<li> CPU 0: <tt>list_del_rcu(p);</tt> -<li> CPU 0: <tt>synchronize_rcu()</tt> starts. -<li> CPU 1: <tt>do_something_with(q->a); - /* No smp_mb(), so might happen after kfree(). */</tt> -<li> CPU 1: <tt>rcu_read_unlock()</tt> -<li> CPU 0: <tt>synchronize_rcu()</tt> returns. -<li> CPU 0: <tt>kfree(p);</tt> -</ol> - -<p> -Therefore, there absolutely must be a full memory barrier between the -end of the RCU read-side critical section and the end of the -grace period. - -<p> -The sequence of events demonstrating the necessity of the second rule -is roughly similar: - -<ol> -<li> CPU 0: <tt>list_del_rcu(p);</tt> -<li> CPU 0: <tt>synchronize_rcu()</tt> starts. -<li> CPU 1: <tt>rcu_read_lock()</tt> -<li> CPU 1: <tt>q = rcu_dereference(gp); - /* Might return p if no memory barrier. */</tt> -<li> CPU 0: <tt>synchronize_rcu()</tt> returns. -<li> CPU 0: <tt>kfree(p);</tt> -<li> CPU 1: <tt>do_something_with(q->a); /* Boom!!! 
*/</tt> -<li> CPU 1: <tt>rcu_read_unlock()</tt> -</ol> - -<p> -And similarly, without a memory barrier between the beginning of the -grace period and the beginning of the RCU read-side critical section, -CPU 1 might end up accessing the freelist. - -<p> -The “as if” rule of course applies, so that any implementation -that acts as if the appropriate memory barriers were in place is a -correct implementation. -That said, it is much easier to fool yourself into believing that you have -adhered to the as-if rule than it is to actually adhere to it! - - -</p><p><a href="#Quick%20Quiz%206"><b>Back to Quick Quiz 6</b>.</a> - -<a name="qq7answer"></a> -<p><b>Quick Quiz 7</b>: -But how does the upgrade-to-write operation exclude other readers? - - -</p><p><b>Answer</b>: -It doesn't, just like normal RCU updates, which also do not exclude -RCU readers. - - -</p><p><a href="#Quick%20Quiz%207"><b>Back to Quick Quiz 7</b>.</a> - -<a name="qq8answer"></a> -<p><b>Quick Quiz 8</b>: -Can't the compiler also reorder this code? - - -</p><p><b>Answer</b>: -No, the volatile casts in <tt>READ_ONCE()</tt> and -<tt>WRITE_ONCE()</tt> prevent the compiler from reordering in -this particular case. - - -</p><p><a href="#Quick%20Quiz%208"><b>Back to Quick Quiz 8</b>.</a> - -<a name="qq9answer"></a> -<p><b>Quick Quiz 9</b>: -Suppose that synchronize_rcu() did wait until all readers had completed. -Would the updater be able to rely on this? - - -</p><p><b>Answer</b>: -No. -Even if <tt>synchronize_rcu()</tt> were to wait until -all readers had completed, a new reader might start immediately after -<tt>synchronize_rcu()</tt> completed. -Therefore, the code following -<tt>synchronize_rcu()</tt> cannot rely on there being no readers -in any case. 
- - -</p><p><a href="#Quick%20Quiz%209"><b>Back to Quick Quiz 9</b>.</a> - -<a name="qq10answer"></a> -<p><b>Quick Quiz 10</b>: -How long a sequence of grace periods, each separated by an RCU read-side -critical section, would be required to partition the RCU read-side -critical sections at the beginning and end of the chain? - - -</p><p><b>Answer</b>: -In theory, an infinite number. -In practice, an unknown number that is sensitive to both implementation -details and timing considerations. -Therefore, even in practice, RCU users must abide by the theoretical rather -than the practical answer. - - -</p><p><a href="#Quick%20Quiz%2010"><b>Back to Quick Quiz 10</b>.</a> - -<a name="qq11answer"></a> -<p><b>Quick Quiz 11</b>: -What about sleeping locks? - - -</p><p><b>Answer</b>: -These are forbidden within Linux-kernel RCU read-side critical sections -because it is not legal to place a quiescent state (in this case, -voluntary context switch) within an RCU read-side critical section. -However, sleeping locks may be used within userspace RCU read-side critical -sections, and also within Linux-kernel sleepable RCU -<a href="#Sleepable RCU">(SRCU)</a> -read-side critical sections. -In addition, the -rt patchset turns spinlocks into sleeping locks so -that the corresponding critical sections can be preempted, which -also means that these sleeplockified spinlocks (but not other sleeping locks!) -may be acquired within -rt-Linux-kernel RCU read-side critical sections. - -<p> -Note that it <i>is</i> legal for a normal RCU read-side critical section -to conditionally acquire a sleeping lock (as in <tt>mutex_trylock()</tt>), -but only as long as it does not loop indefinitely attempting to -conditionally acquire that sleeping lock. -The key point is that things like <tt>mutex_trylock()</tt> -either return with the mutex held, or return an error indication if -the mutex was not immediately available. -Either way, <tt>mutex_trylock()</tt> returns immediately without sleeping.
- - -</p><p><a href="#Quick%20Quiz%2011"><b>Back to Quick Quiz 11</b>.</a> - -<a name="qq12answer"></a> -<p><b>Quick Quiz 12</b>: -Why does line 19 use <tt>rcu_access_pointer()</tt>? -After all, <tt>call_rcu()</tt> on line 25 stores into the -structure, which would interact badly with concurrent insertions. -Doesn't this mean that <tt>rcu_dereference()</tt> is required? - - -</p><p><b>Answer</b>: -Presumably the <tt>->gp_lock</tt> acquired on line 18 excludes -any changes, including any insertions that <tt>rcu_dereference()</tt> -would protect against. -Therefore, any insertions will be delayed until after <tt>->gp_lock</tt> -is released on line 25, which in turn means that -<tt>rcu_access_pointer()</tt> suffices. - - -</p><p><a href="#Quick%20Quiz%2012"><b>Back to Quick Quiz 12</b>.</a> - -<a name="qq13answer"></a> -<p><b>Quick Quiz 13</b>: -Earlier it was claimed that <tt>call_rcu()</tt> and -<tt>kfree_rcu()</tt> allowed updaters to avoid being blocked -by readers. -But how can that be correct, given that the invocation of the callback -and the freeing of the memory (respectively) must still wait for -a grace period to elapse? - - -</p><p><b>Answer</b>: -We could define things this way, but keep in mind that this sort of -definition would say that updates in garbage-collected languages -cannot complete until the next time the garbage collector runs, -which does not seem at all reasonable. -The key point is that in most cases, an updater using either -<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> can proceed to the -next update as soon as it has invoked <tt>call_rcu()</tt> or -<tt>kfree_rcu()</tt>, without having to wait for a subsequent -grace period. - - -</p><p><a href="#Quick%20Quiz%2013"><b>Back to Quick Quiz 13</b>.</a> - -<a name="qq14answer"></a> -<p><b>Quick Quiz 14</b>: -So what happens with <tt>synchronize_rcu()</tt> during -scheduler initialization for <tt>CONFIG_PREEMPT=n</tt> -kernels? 
- - -</p><p><b>Answer</b>: -In <tt>CONFIG_PREEMPT=n</tt> kernel, <tt>synchronize_rcu()</tt> -maps directly to <tt>synchronize_sched()</tt>. -Therefore, <tt>synchronize_rcu()</tt> works normally throughout -boot in <tt>CONFIG_PREEMPT=n</tt> kernels. -However, your code must also work in <tt>CONFIG_PREEMPT=y</tt> kernels, -so it is still necessary to avoid invoking <tt>synchronize_rcu()</tt> -during scheduler initialization. - - -</p><p><a href="#Quick%20Quiz%2014"><b>Back to Quick Quiz 14</b>.</a> - - </body></html> diff --git a/Documentation/RCU/Design/Requirements/Requirements.htmlx b/Documentation/RCU/Design/Requirements/Requirements.htmlx deleted file mode 100644 index 3a97ba490c42..000000000000 --- a/Documentation/RCU/Design/Requirements/Requirements.htmlx +++ /dev/null @@ -1,2741 +0,0 @@ -<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" - "http://www.w3.org/TR/html4/loose.dtd"> - <html> - <head><title>A Tour Through RCU's Requirements [LWN.net]</title> - <meta HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=utf-8"> - -<h1>A Tour Through RCU's Requirements</h1> - -<p>Copyright IBM Corporation, 2015</p> -<p>Author: Paul E. McKenney</p> -<p><i>The initial version of this document appeared in the -<a href="https://lwn.net/">LWN</a> articles -<a href="https://lwn.net/Articles/652156/">here</a>, -<a href="https://lwn.net/Articles/652677/">here</a>, and -<a href="https://lwn.net/Articles/653326/">here</a>.</i></p> - -<h2>Introduction</h2> - -<p> -Read-copy update (RCU) is a synchronization mechanism that is often -used as a replacement for reader-writer locking. -RCU is unusual in that updaters do not block readers, -which means that RCU's read-side primitives can be exceedingly fast -and scalable. -In addition, updaters can make useful forward progress concurrently -with readers. 
-However, all this concurrency between RCU readers and updaters does raise -the question of exactly what RCU readers are doing, which in turn -raises the question of exactly what RCU's requirements are. - -<p> -This document therefore summarizes RCU's requirements, and can be thought -of as an informal, high-level specification for RCU. -It is important to understand that RCU's specification is primarily -empirical in nature; -in fact, I learned about many of these requirements the hard way. -This situation might cause some consternation, however, not only -has this learning process been a lot of fun, but it has also been -a great privilege to work with so many people willing to apply -technologies in interesting new ways. - -<p> -All that aside, here are the categories of currently known RCU requirements: -</p> - -<ol> -<li> <a href="#Fundamental Requirements"> - Fundamental Requirements</a> -<li> <a href="#Fundamental Non-Requirements">Fundamental Non-Requirements</a> -<li> <a href="#Parallelism Facts of Life"> - Parallelism Facts of Life</a> -<li> <a href="#Quality-of-Implementation Requirements"> - Quality-of-Implementation Requirements</a> -<li> <a href="#Linux Kernel Complications"> - Linux Kernel Complications</a> -<li> <a href="#Software-Engineering Requirements"> - Software-Engineering Requirements</a> -<li> <a href="#Other RCU Flavors"> - Other RCU Flavors</a> -<li> <a href="#Possible Future Changes"> - Possible Future Changes</a> -</ol> - -<p> -This is followed by a <a href="#Summary">summary</a>, -which is in turn followed by the inevitable -<a href="#Answers to Quick Quizzes">answers to the quick quizzes</a>. - -<h2><a name="Fundamental Requirements">Fundamental Requirements</a></h2> - -<p> -RCU's fundamental requirements are the closest thing RCU has to hard -mathematical requirements. 
-These are: - -<ol> -<li> <a href="#Grace-Period Guarantee"> - Grace-Period Guarantee</a> -<li> <a href="#Publish-Subscribe Guarantee"> - Publish-Subscribe Guarantee</a> -<li> <a href="#Memory-Barrier Guarantees"> - Memory-Barrier Guarantees</a> -<li> <a href="#RCU Primitives Guaranteed to Execute Unconditionally"> - RCU Primitives Guaranteed to Execute Unconditionally</a> -<li> <a href="#Guaranteed Read-to-Write Upgrade"> - Guaranteed Read-to-Write Upgrade</a> -</ol> - -<h3><a name="Grace-Period Guarantee">Grace-Period Guarantee</a></h3> - -<p> -RCU's grace-period guarantee is unusual in being premeditated: -Jack Slingwine and I had this guarantee firmly in mind when we started -work on RCU (then called “rclock”) in the early 1990s. -That said, the past two decades of experience with RCU have produced -a much more detailed understanding of this guarantee. - -<p> -RCU's grace-period guarantee allows updaters to wait for the completion -of all pre-existing RCU read-side critical sections. -An RCU read-side critical section -begins with the marker <tt>rcu_read_lock()</tt> and ends with -the marker <tt>rcu_read_unlock()</tt>. -These markers may be nested, and RCU treats a nested set as one -big RCU read-side critical section. -Production-quality implementations of <tt>rcu_read_lock()</tt> and -<tt>rcu_read_unlock()</tt> are extremely lightweight, and in -fact have exactly zero overhead in Linux kernels built for production -use with <tt>CONFIG_PREEMPT=n</tt>. 
- -<p> -This guarantee allows ordering to be enforced with extremely low -overhead to readers, for example: - -<blockquote> -<pre> - 1 int x, y; - 2 - 3 void thread0(void) - 4 { - 5 rcu_read_lock(); - 6 r1 = READ_ONCE(x); - 7 r2 = READ_ONCE(y); - 8 rcu_read_unlock(); - 9 } -10 -11 void thread1(void) -12 { -13 WRITE_ONCE(x, 1); -14 synchronize_rcu(); -15 WRITE_ONCE(y, 1); -16 } -</pre> -</blockquote> - -<p> -Because the <tt>synchronize_rcu()</tt> on line 14 waits for -all pre-existing readers, any instance of <tt>thread0()</tt> that -loads a value of zero from <tt>x</tt> must complete before -<tt>thread1()</tt> stores to <tt>y</tt>, so that instance must -also load a value of zero from <tt>y</tt>. -Similarly, any instance of <tt>thread0()</tt> that loads a value of -one from <tt>y</tt> must have started after the -<tt>synchronize_rcu()</tt> started, and must therefore also load -a value of one from <tt>x</tt>. -Therefore, the outcome: -<blockquote> -<pre> -(r1 == 0 && r2 == 1) -</pre> -</blockquote> -cannot happen. - -<p>@@QQ@@ -Wait a minute! -You said that updaters can make useful forward progress concurrently -with readers, but pre-existing readers will block -<tt>synchronize_rcu()</tt>!!! -Just who are you trying to fool??? -<p>@@QQA@@ -First, if updaters do not wish to be blocked by readers, they can use -<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt>, which will -be discussed later. -Second, even when using <tt>synchronize_rcu()</tt>, the other -update-side code does run concurrently with readers, whether pre-existing -or not. 
-<p>@@QQE@@ - -<p> -This scenario resembles one of the first uses of RCU in -<a href="https://en.wikipedia.org/wiki/DYNIX">DYNIX/ptx</a>, -which managed a distributed lock manager's transition into -a state suitable for handling recovery from node failure, -more or less as follows: - -<blockquote> -<pre> - 1 #define STATE_NORMAL 0 - 2 #define STATE_WANT_RECOVERY 1 - 3 #define STATE_RECOVERING 2 - 4 #define STATE_WANT_NORMAL 3 - 5 - 6 int state = STATE_NORMAL; - 7 - 8 void do_something_dlm(void) - 9 { -10 int state_snap; -11 -12 rcu_read_lock(); -13 state_snap = READ_ONCE(state); -14 if (state_snap == STATE_NORMAL) -15 do_something(); -16 else -17 do_something_carefully(); -18 rcu_read_unlock(); -19 } -20 -21 void start_recovery(void) -22 { -23 WRITE_ONCE(state, STATE_WANT_RECOVERY); -24 synchronize_rcu(); -25 WRITE_ONCE(state, STATE_RECOVERING); -26 recovery(); -27 WRITE_ONCE(state, STATE_WANT_NORMAL); -28 synchronize_rcu(); -29 WRITE_ONCE(state, STATE_NORMAL); -30 } -</pre> -</blockquote> - -<p> -The RCU read-side critical section in <tt>do_something_dlm()</tt> -works with the <tt>synchronize_rcu()</tt> in <tt>start_recovery()</tt> -to guarantee that <tt>do_something()</tt> never runs concurrently -with <tt>recovery()</tt>, but with little or no synchronization -overhead in <tt>do_something_dlm()</tt>. - -<p>@@QQ@@ -Why is the <tt>synchronize_rcu()</tt> on line 28 needed? -<p>@@QQA@@ -Without that extra grace period, memory reordering could result in -<tt>do_something_dlm()</tt> executing <tt>do_something()</tt> -concurrently with the last bits of <tt>recovery()</tt>. -<p>@@QQE@@ - -<p> -In order to avoid fatal problems such as deadlocks, -an RCU read-side critical section must not contain calls to -<tt>synchronize_rcu()</tt>. -Similarly, an RCU read-side critical section must not -contain anything that waits, directly or indirectly, on completion of -an invocation of <tt>synchronize_rcu()</tt>. 
- -<p> -Although RCU's grace-period guarantee is useful in and of itself, with -<a href="https://lwn.net/Articles/573497/">quite a few use cases</a>, -it would be good to be able to use RCU to coordinate read-side -access to linked data structures. -For this, the grace-period guarantee is not sufficient, as can -be seen in function <tt>add_gp_buggy()</tt> below. -We will look at the reader's code later, but in the meantime, just think of -the reader as locklessly picking up the <tt>gp</tt> pointer, -and, if the value loaded is non-<tt>NULL</tt>, locklessly accessing the -<tt>->a</tt> and <tt>->b</tt> fields. - -<blockquote> -<pre> - 1 bool add_gp_buggy(int a, int b) - 2 { - 3 p = kmalloc(sizeof(*p), GFP_KERNEL); - 4 if (!p) - 5 return -ENOMEM; - 6 spin_lock(&gp_lock); - 7 if (rcu_access_pointer(gp)) { - 8 spin_unlock(&gp_lock); - 9 return false; -10 } -11 p->a = a; -12 p->b = a; -13 gp = p; /* ORDERING BUG */ -14 spin_unlock(&gp_lock); -15 return true; -16 } -</pre> -</blockquote> - -<p> -The problem is that both the compiler and weakly ordered CPUs are within -their rights to reorder this code as follows: - -<blockquote> -<pre> - 1 bool add_gp_buggy_optimized(int a, int b) - 2 { - 3 p = kmalloc(sizeof(*p), GFP_KERNEL); - 4 if (!p) - 5 return -ENOMEM; - 6 spin_lock(&gp_lock); - 7 if (rcu_access_pointer(gp)) { - 8 spin_unlock(&gp_lock); - 9 return false; -10 } -<b>11 gp = p; /* ORDERING BUG */ -12 p->a = a; -13 p->b = a;</b> -14 spin_unlock(&gp_lock); -15 return true; -16 } -</pre> -</blockquote> - -<p> -If an RCU reader fetches <tt>gp</tt> just after -<tt>add_gp_buggy_optimized</tt> executes line 11, -it will see garbage in the <tt>->a</tt> and <tt>->b</tt> -fields. -And this is but one of many ways in which compiler and hardware optimizations -could cause trouble. -Therefore, we clearly need some way to prevent the compiler and the CPU from -reordering in this manner, which brings us to the publish-subscribe -guarantee discussed in the next section. 
- -<h3><a name="Publish-Subscribe Guarantee">Publish/Subscribe Guarantee</a></h3> - -<p> -RCU's publish-subscribe guarantee allows data to be inserted -into a linked data structure without disrupting RCU readers. -The updater uses <tt>rcu_assign_pointer()</tt> to insert the -new data, and readers use <tt>rcu_dereference()</tt> to -access data, whether new or old. -The following shows an example of insertion: - -<blockquote> -<pre> - 1 bool add_gp(int a, int b) - 2 { - 3 p = kmalloc(sizeof(*p), GFP_KERNEL); - 4 if (!p) - 5 return -ENOMEM; - 6 spin_lock(&gp_lock); - 7 if (rcu_access_pointer(gp)) { - 8 spin_unlock(&gp_lock); - 9 return false; -10 } -11 p->a = a; -12 p->b = a; -13 rcu_assign_pointer(gp, p); -14 spin_unlock(&gp_lock); -15 return true; -16 } -</pre> -</blockquote> - -<p> -The <tt>rcu_assign_pointer()</tt> on line 13 is conceptually -equivalent to a simple assignment statement, but also guarantees -that its assignment will -happen after the two assignments in lines 11 and 12, -similar to the C11 <tt>memory_order_release</tt> store operation. -It also prevents any number of “interesting” compiler -optimizations, for example, the use of <tt>gp</tt> as a scratch -location immediately preceding the assignment. - -<p>@@QQ@@ -But <tt>rcu_assign_pointer()</tt> does nothing to prevent the -two assignments to <tt>p->a</tt> and <tt>p->b</tt> -from being reordered. -Can't that also cause problems? -<p>@@QQA@@ -No, it cannot. -The readers cannot see either of these two fields until -the assignment to <tt>gp</tt>, by which time both fields are -fully initialized. -So reordering the assignments -to <tt>p->a</tt> and <tt>p->b</tt> cannot possibly -cause any problems. 
-<p>@@QQE@@ - -<p> -It is tempting to assume that the reader need not do anything special -to control its accesses to the RCU-protected data, -as shown in <tt>do_something_gp_buggy()</tt> below: - -<blockquote> -<pre> - 1 bool do_something_gp_buggy(void) - 2 { - 3 rcu_read_lock(); - 4 p = gp; /* OPTIMIZATIONS GALORE!!! */ - 5 if (p) { - 6 do_something(p->a, p->b); - 7 rcu_read_unlock(); - 8 return true; - 9 } -10 rcu_read_unlock(); -11 return false; -12 } -</pre> -</blockquote> - -<p> -However, this temptation must be resisted because there are a -surprisingly large number of ways that the compiler -(to say nothing of -<a href="https://h71000.www7.hp.com/wizard/wiz_2637.html">DEC Alpha CPUs</a>) -can trip this code up. -For but one example, if the compiler were short of registers, it -might choose to refetch from <tt>gp</tt> rather than keeping -a separate copy in <tt>p</tt> as follows: - -<blockquote> -<pre> - 1 bool do_something_gp_buggy_optimized(void) - 2 { - 3 rcu_read_lock(); - 4 if (gp) { /* OPTIMIZATIONS GALORE!!! */ -<b> 5 do_something(gp->a, gp->b);</b> - 6 rcu_read_unlock(); - 7 return true; - 8 } - 9 rcu_read_unlock(); -10 return false; -11 } -</pre> -</blockquote> - -<p> -If this function ran concurrently with a series of updates that -replaced the current structure with a new one, -the fetches of <tt>gp->a</tt> -and <tt>gp->b</tt> might well come from two different structures, -which could cause serious confusion. -To prevent this (and much else besides), <tt>do_something_gp()</tt> uses -<tt>rcu_dereference()</tt> to fetch from <tt>gp</tt>: - -<blockquote> -<pre> - 1 bool do_something_gp(void) - 2 { - 3 rcu_read_lock(); - 4 p = rcu_dereference(gp); - 5 if (p) { - 6 do_something(p->a, p->b); - 7 rcu_read_unlock(); - 8 return true; - 9 } -10 rcu_read_unlock(); -11 return false; -12 } -</pre> -</blockquote> - -<p> -The <tt>rcu_dereference()</tt> uses volatile casts and (for DEC Alpha) -memory barriers in the Linux kernel. 
-Should a -<a href="http://www.rdrop.com/users/paulmck/RCU/consume.2015.07.13a.pdf">high-quality implementation of C11 <tt>memory_order_consume</tt> [PDF]</a> -ever appear, then <tt>rcu_dereference()</tt> could be implemented -as a <tt>memory_order_consume</tt> load. -Regardless of the exact implementation, a pointer fetched by -<tt>rcu_dereference()</tt> may not be used outside of the -outermost RCU read-side critical section containing that -<tt>rcu_dereference()</tt>, unless protection of -the corresponding data element has been passed from RCU to some -other synchronization mechanism, most commonly locking or -<a href="https://www.kernel.org/doc/Documentation/RCU/rcuref.txt">reference counting</a>. - -<p> -In short, updaters use <tt>rcu_assign_pointer()</tt> and readers -use <tt>rcu_dereference()</tt>, and these two RCU API elements -work together to ensure that readers have a consistent view of -newly added data elements. - -<p> -Of course, it is also necessary to remove elements from RCU-protected -data structures, for example, using the following process: - -<ol> -<li> Remove the data element from the enclosing structure. -<li> Wait for all pre-existing RCU read-side critical sections - to complete (because only pre-existing readers can possibly have - a reference to the newly removed data element). -<li> At this point, only the updater has a reference to the - newly removed data element, so it can safely reclaim - the data element, for example, by passing it to <tt>kfree()</tt>. 
-</ol> - -This process is implemented by <tt>remove_gp_synchronous()</tt>: - -<blockquote> -<pre> - 1 bool remove_gp_synchronous(void) - 2 { - 3 struct foo *p; - 4 - 5 spin_lock(&gp_lock); - 6 p = rcu_access_pointer(gp); - 7 if (!p) { - 8 spin_unlock(&gp_lock); - 9 return false; -10 } -11 rcu_assign_pointer(gp, NULL); -12 spin_unlock(&gp_lock); -13 synchronize_rcu(); -14 kfree(p); -15 return true; -16 } -</pre> -</blockquote> - -<p> -This function is straightforward, with line 13 waiting for a grace -period before line 14 frees the old data element. -This waiting ensures that readers will reach line 7 of -<tt>do_something_gp()</tt> before the data element referenced by -<tt>p</tt> is freed. -The <tt>rcu_access_pointer()</tt> on line 6 is similar to -<tt>rcu_dereference()</tt>, except that: - -<ol> -<li> The value returned by <tt>rcu_access_pointer()</tt> - cannot be dereferenced. - If you want to access the value pointed to as well as - the pointer itself, use <tt>rcu_dereference()</tt> - instead of <tt>rcu_access_pointer()</tt>. -<li> The call to <tt>rcu_access_pointer()</tt> need not be - protected. - In contrast, <tt>rcu_dereference()</tt> must either be - within an RCU read-side critical section or in a code - segment where the pointer cannot change, for example, in - code protected by the corresponding update-side lock. -</ol> - -<p>@@QQ@@ -Without the <tt>rcu_dereference()</tt> or the -<tt>rcu_access_pointer()</tt>, what destructive optimizations -might the compiler make use of? -<p>@@QQA@@ -Let's start with what happens to <tt>do_something_gp()</tt> -if it fails to use <tt>rcu_dereference()</tt>. -It could reuse a value formerly fetched from this same pointer. -It could also fetch the pointer from <tt>gp</tt> in a byte-at-a-time -manner, resulting in <i>load tearing</i>, in turn resulting a bytewise -mash-up of two distince pointer values. 
-It might even use value-speculation optimizations, where it makes a wrong -guess, but by the time it gets around to checking the value, an update -has changed the pointer to match the wrong guess. -Too bad about any dereferences that returned pre-initialization garbage -in the meantime! - -<p> -For <tt>remove_gp_synchronous()</tt>, as long as all modifications -to <tt>gp</tt> are carried out while holding <tt>gp_lock</tt>, -the above optimizations are harmless. -However, -with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt>, -<tt>sparse</tt> will complain if you -define <tt>gp</tt> with <tt>__rcu</tt> and then -access it without using -either <tt>rcu_access_pointer()</tt> or <tt>rcu_dereference()</tt>. -<p>@@QQE@@ - -<p> -In short, RCU's publish-subscribe guarantee is provided by the combination -of <tt>rcu_assign_pointer()</tt> and <tt>rcu_dereference()</tt>. -This guarantee allows data elements to be safely added to RCU-protected -linked data structures without disrupting RCU readers. -This guarantee can be used in combination with the grace-period -guarantee to also allow data elements to be removed from RCU-protected -linked data structures, again without disrupting RCU readers. - -<p> -This guarantee was only partially premeditated. -DYNIX/ptx used an explicit memory barrier for publication, but had nothing -resembling <tt>rcu_dereference()</tt> for subscription, nor did it -have anything resembling the <tt>smp_read_barrier_depends()</tt> -that was later subsumed into <tt>rcu_dereference()</tt>. -The need for these operations made itself known quite suddenly at a -late-1990s meeting with the DEC Alpha architects, back in the days when -DEC was still a free-standing company. -It took the Alpha architects a good hour to convince me that any sort -of barrier would ever be needed, and it then took me a good <i>two</i> hours -to convince them that their documentation did not make this point clear. 
-More recent work with the C and C++ standards committees has provided -much education on tricks and traps from the compiler. -In short, compilers were much less tricky in the early 1990s, but in -2015, don't even think about omitting <tt>rcu_dereference()</tt>! - -<h3><a name="Memory-Barrier Guarantees">Memory-Barrier Guarantees</a></h3> - -<p> -The previous section's simple linked-data-structure scenario clearly -demonstrates the need for RCU's stringent memory-ordering guarantees on -systems with more than one CPU: - -<ol> -<li> Each CPU that has an RCU read-side critical section that - begins before <tt>synchronize_rcu()</tt> starts is - guaranteed to execute a full memory barrier between the time - that the RCU read-side critical section ends and the time that - <tt>synchronize_rcu()</tt> returns. - Without this guarantee, a pre-existing RCU read-side critical section - might hold a reference to the newly removed <tt>struct foo</tt> - after the <tt>kfree()</tt> on line 14 of - <tt>remove_gp_synchronous()</tt>. -<li> Each CPU that has an RCU read-side critical section that ends - after <tt>synchronize_rcu()</tt> returns is guaranteed - to execute a full memory barrier between the time that - <tt>synchronize_rcu()</tt> begins and the time that the RCU - read-side critical section begins. - Without this guarantee, a later RCU read-side critical section - running after the <tt>kfree()</tt> on line 14 of - <tt>remove_gp_synchronous()</tt> might - later run <tt>do_something_gp()</tt> and find the - newly deleted <tt>struct foo</tt>. -<li> If the task invoking <tt>synchronize_rcu()</tt> remains - on a given CPU, then that CPU is guaranteed to execute a full - memory barrier sometime during the execution of - <tt>synchronize_rcu()</tt>. - This guarantee ensures that the <tt>kfree()</tt> on - line 14 of <tt>remove_gp_synchronous()</tt> really does - execute after the removal on line 11. 
-<li> If the task invoking <tt>synchronize_rcu()</tt> migrates - among a group of CPUs during that invocation, then each of the - CPUs in that group is guaranteed to execute a full memory barrier - sometime during the execution of <tt>synchronize_rcu()</tt>. - This guarantee also ensures that the <tt>kfree()</tt> on - line 14 of <tt>remove_gp_synchronous()</tt> really does - execute after the removal on - line 11, but also in the case where the thread executing the - <tt>synchronize_rcu()</tt> migrates in the meantime. -</ol> - -<p>@@QQ@@ -Given that multiple CPUs can start RCU read-side critical sections -at any time without any ordering whatsoever, how can RCU possibly tell whether -or not a given RCU read-side critical section starts before a -given instance of <tt>synchronize_rcu()</tt>? -<p>@@QQA@@ -If RCU cannot tell whether or not a given -RCU read-side critical section starts before a -given instance of <tt>synchronize_rcu()</tt>, -then it must assume that the RCU read-side critical section -started first. -In other words, a given instance of <tt>synchronize_rcu()</tt> -can avoid waiting on a given RCU read-side critical section only -if it can prove that <tt>synchronize_rcu()</tt> started first. -<p>@@QQE@@ - -<p>@@QQ@@ -The first and second guarantees require unbelievably strict ordering! -Are all these memory barriers <i> really</i> required? -<p>@@QQA@@ -Yes, they really are required. -To see why the first guarantee is required, consider the following -sequence of events: - -<ol> -<li> CPU 1: <tt>rcu_read_lock()</tt> -<li> CPU 1: <tt>q = rcu_dereference(gp); - /* Very likely to return p. */</tt> -<li> CPU 0: <tt>list_del_rcu(p);</tt> -<li> CPU 0: <tt>synchronize_rcu()</tt> starts. -<li> CPU 1: <tt>do_something_with(q->a); - /* No smp_mb(), so might happen after kfree(). */</tt> -<li> CPU 1: <tt>rcu_read_unlock()</tt> -<li> CPU 0: <tt>synchronize_rcu()</tt> returns. 
-<li> CPU 0: <tt>kfree(p);</tt> -</ol> - -<p> -Therefore, there absolutely must be a full memory barrier between the -end of the RCU read-side critical section and the end of the -grace period. - -<p> -The sequence of events demonstrating the necessity of the second rule -is roughly similar: - -<ol> -<li> CPU 0: <tt>list_del_rcu(p);</tt> -<li> CPU 0: <tt>synchronize_rcu()</tt> starts. -<li> CPU 1: <tt>rcu_read_lock()</tt> -<li> CPU 1: <tt>q = rcu_dereference(gp); - /* Might return p if no memory barrier. */</tt> -<li> CPU 0: <tt>synchronize_rcu()</tt> returns. -<li> CPU 0: <tt>kfree(p);</tt> -<li> CPU 1: <tt>do_something_with(q->a); /* Boom!!! */</tt> -<li> CPU 1: <tt>rcu_read_unlock()</tt> -</ol> - -<p> -And similarly, without a memory barrier between the beginning of the -grace period and the beginning of the RCU read-side critical section, -CPU 1 might end up accessing the freelist. - -<p> -The “as if” rule of course applies, so that any implementation -that acts as if the appropriate memory barriers were in place is a -correct implementation. -That said, it is much easier to fool yourself into believing that you have -adhered to the as-if rule than it is to actually adhere to it! -<p>@@QQE@@ - -<p> -Note that these memory-barrier requirements do not replace the fundamental -RCU requirement that a grace period wait for all pre-existing readers. -On the contrary, the memory barriers called out in this section must operate in -such a way as to <i>enforce</i> this fundamental requirement. -Of course, different implementations enforce this requirement in different -ways, but enforce it they must. - -<h3><a name="RCU Primitives Guaranteed to Execute Unconditionally">RCU Primitives Guaranteed to Execute Unconditionally</a></h3> - -<p> -The common-case RCU primitives are unconditional. -They are invoked, they do their job, and they return, with no possibility -of error, and no need to retry. -This is a key RCU design philosophy. 
- -<p> -However, this philosophy is pragmatic rather than pigheaded. -If someone comes up with a good justification for a particular conditional -RCU primitive, it might well be implemented and added. -After all, this guarantee was reverse-engineered, not premeditated. -The unconditional nature of the RCU primitives was initially an -accident of implementation, and later experience with synchronization -primitives with conditional primitives caused me to elevate this -accident to a guarantee. -Therefore, the justification for adding a conditional primitive to -RCU would need to be based on detailed and compelling use cases. - -<h3><a name="Guaranteed Read-to-Write Upgrade">Guaranteed Read-to-Write Upgrade</a></h3> - -<p> -As far as RCU is concerned, it is always possible to carry out an -update within an RCU read-side critical section. -For example, that RCU read-side critical section might search for -a given data element, and then might acquire the update-side -spinlock in order to update that element, all while remaining -in that RCU read-side critical section. -Of course, it is necessary to exit the RCU read-side critical section -before invoking <tt>synchronize_rcu()</tt>, however, this -inconvenience can be avoided through use of the -<tt>call_rcu()</tt> and <tt>kfree_rcu()</tt> API members -described later in this document. - -<p>@@QQ@@ -But how does the upgrade-to-write operation exclude other readers? -<p>@@QQA@@ -It doesn't, just like normal RCU updates, which also do not exclude -RCU readers. -<p>@@QQE@@ - -<p> -This guarantee allows lookup code to be shared between read-side -and update-side code, and was premeditated, appearing in the earliest -DYNIX/ptx RCU documentation. - -<h2><a name="Fundamental Non-Requirements">Fundamental Non-Requirements</a></h2> - -<p> -RCU provides extremely lightweight readers, and its read-side guarantees, -though quite useful, are correspondingly lightweight. 
-It is therefore all too easy to assume that RCU is guaranteeing more -than it really is. -Of course, the list of things that RCU does not guarantee is infinitely -long, however, the following sections list a few non-guarantees that -have caused confusion. -Except where otherwise noted, these non-guarantees were premeditated. - -<ol> -<li> <a href="#Readers Impose Minimal Ordering"> - Readers Impose Minimal Ordering</a> -<li> <a href="#Readers Do Not Exclude Updaters"> - Readers Do Not Exclude Updaters</a> -<li> <a href="#Updaters Only Wait For Old Readers"> - Updaters Only Wait For Old Readers</a> -<li> <a href="#Grace Periods Don't Partition Read-Side Critical Sections"> - Grace Periods Don't Partition Read-Side Critical Sections</a> -<li> <a href="#Read-Side Critical Sections Don't Partition Grace Periods"> - Read-Side Critical Sections Don't Partition Grace Periods</a> -<li> <a href="#Disabling Preemption Does Not Block Grace Periods"> - Disabling Preemption Does Not Block Grace Periods</a> -</ol> - -<h3><a name="Readers Impose Minimal Ordering">Readers Impose Minimal Ordering</a></h3> - -<p> -Reader-side markers such as <tt>rcu_read_lock()</tt> and -<tt>rcu_read_unlock()</tt> provide absolutely no ordering guarantees -except through their interaction with the grace-period APIs such as -<tt>synchronize_rcu()</tt>. 
-To see this, consider the following pair of threads: - -<blockquote> -<pre> - 1 void thread0(void) - 2 { - 3 rcu_read_lock(); - 4 WRITE_ONCE(x, 1); - 5 rcu_read_unlock(); - 6 rcu_read_lock(); - 7 WRITE_ONCE(y, 1); - 8 rcu_read_unlock(); - 9 } -10 -11 void thread1(void) -12 { -13 rcu_read_lock(); -14 r1 = READ_ONCE(y); -15 rcu_read_unlock(); -16 rcu_read_lock(); -17 r2 = READ_ONCE(x); -18 rcu_read_unlock(); -19 } -</pre> -</blockquote> - -<p> -After <tt>thread0()</tt> and <tt>thread1()</tt> execute -concurrently, it is quite possible to have - -<blockquote> -<pre> -(r1 == 1 && r2 == 0) -</pre> -</blockquote> - -(that is, <tt>y</tt> appears to have been assigned before <tt>x</tt>), -which would not be possible if <tt>rcu_read_lock()</tt> and -<tt>rcu_read_unlock()</tt> had much in the way of ordering -properties. -But they do not, so the CPU is within its rights -to do significant reordering. -This is by design: Any significant ordering constraints would slow down -these fast-path APIs. - -<p>@@QQ@@ -Can't the compiler also reorder this code? -<p>@@QQA@@ -No, the volatile casts in <tt>READ_ONCE()</tt> and -<tt>WRITE_ONCE()</tt> prevent the compiler from reordering in -this particular case. -<p>@@QQE@@ - -<h3><a name="Readers Do Not Exclude Updaters">Readers Do Not Exclude Updaters</a></h3> - -<p> -Neither <tt>rcu_read_lock()</tt> nor <tt>rcu_read_unlock()</tt> -exclude updates. -All they do is to prevent grace periods from ending. -The following example illustrates this: - -<blockquote> -<pre> - 1 void thread0(void) - 2 { - 3 rcu_read_lock(); - 4 r1 = READ_ONCE(y); - 5 if (r1) { - 6 do_something_with_nonzero_x(); - 7 r2 = READ_ONCE(x); - 8 WARN_ON(!r2); /* BUG!!! 
*/ - 9 } -10 rcu_read_unlock(); -11 } -12 -13 void thread1(void) -14 { -15 spin_lock(&my_lock); -16 WRITE_ONCE(x, 1); -17 WRITE_ONCE(y, 1); -18 spin_unlock(&my_lock); -19 } -</pre> -</blockquote> - -<p> -If the <tt>thread0()</tt> function's <tt>rcu_read_lock()</tt> -excluded the <tt>thread1()</tt> function's update, -the <tt>WARN_ON()</tt> could never fire. -But the fact is that <tt>rcu_read_lock()</tt> does not exclude -much of anything aside from subsequent grace periods, of which -<tt>thread1()</tt> has none, so the -<tt>WARN_ON()</tt> can and does fire. - -<h3><a name="Updaters Only Wait For Old Readers">Updaters Only Wait For Old Readers</a></h3> - -<p> -It might be tempting to assume that after <tt>synchronize_rcu()</tt> -completes, there are no readers executing. -This temptation must be avoided because -new readers can start immediately after <tt>synchronize_rcu()</tt> -starts, and <tt>synchronize_rcu()</tt> is under no -obligation to wait for these new readers. - -<p>@@QQ@@ -Suppose that synchronize_rcu() did wait until all readers had completed. -Would the updater be able to rely on this? -<p>@@QQA@@ -No. -Even if <tt>synchronize_rcu()</tt> were to wait until -all readers had completed, a new reader might start immediately after -<tt>synchronize_rcu()</tt> completed. -Therefore, the code following -<tt>synchronize_rcu()</tt> cannot rely on there being no readers -in any case. -<p>@@QQE@@ - -<h3><a name="Grace Periods Don't Partition Read-Side Critical Sections"> -Grace Periods Don't Partition Read-Side Critical Sections</a></h3> - -<p> -It is tempting to assume that if any part of one RCU read-side critical -section precedes a given grace period, and if any part of another RCU -read-side critical section follows that same grace period, then all of -the first RCU read-side critical section must precede all of the second. -However, this just isn't the case: A single grace period does not -partition the set of RCU read-side critical sections. 
-An example of this situation can be illustrated as follows, where -<tt>a</tt>, <tt>b</tt>, and <tt>c</tt> are initially all zero: - -<blockquote> -<pre> - 1 void thread0(void) - 2 { - 3 rcu_read_lock(); - 4 WRITE_ONCE(a, 1); - 5 WRITE_ONCE(b, 1); - 6 rcu_read_unlock(); - 7 } - 8 - 9 void thread1(void) -10 { -11 r1 = READ_ONCE(a); -12 synchronize_rcu(); -13 WRITE_ONCE(c, 1); -14 } -15 -16 void thread2(void) -17 { -18 rcu_read_lock(); -19 r2 = READ_ONCE(b); -20 r3 = READ_ONCE(c); -21 rcu_read_unlock(); -22 } -</pre> -</blockquote> - -<p> -It turns out that the outcome: - -<blockquote> -<pre> -(r1 == 1 && r2 == 0 && r3 == 1) -</pre> -</blockquote> - -is entirely possible. -The following figure shows how this can happen, with each circled -<tt>QS</tt> indicating the point at which RCU recorded a -<i>quiescent state</i> for each thread, that is, a state in which -RCU knows that the thread cannot be in the midst of an RCU read-side -critical section that started before the current grace period: - -<p><img src="GPpartitionReaders1.svg" alt="GPpartitionReaders1.svg" width="60%"></p> - -<p> -If it is necessary to partition RCU read-side critical sections in this -manner, it is necessary to use two grace periods, where the first -grace period is known to end before the second grace period starts: - -<blockquote> -<pre> - 1 void thread0(void) - 2 { - 3 rcu_read_lock(); - 4 WRITE_ONCE(a, 1); - 5 WRITE_ONCE(b, 1); - 6 rcu_read_unlock(); - 7 } - 8 - 9 void thread1(void) -10 { -11 r1 = READ_ONCE(a); -12 synchronize_rcu(); -13 WRITE_ONCE(c, 1); -14 } -15 -16 void thread2(void) -17 { -18 r2 = READ_ONCE(c); -19 synchronize_rcu(); -20 WRITE_ONCE(d, 1); -21 } -22 -23 void thread3(void) -24 { -25 rcu_read_lock(); -26 r3 = READ_ONCE(b); -27 r4 = READ_ONCE(d); -28 rcu_read_unlock(); -29 } -</pre> -</blockquote> - -<p> -Here, if <tt>(r1 == 1)</tt>, then -<tt>thread0()</tt>'s write to <tt>b</tt> must happen -before the end of <tt>thread1()</tt>'s grace period. 
-If in addition <tt>(r4 == 1)</tt>, then -<tt>thread3()</tt>'s read from <tt>b</tt> must happen -after the beginning of <tt>thread2()</tt>'s grace period. -If it is also the case that <tt>(r2 == 1)</tt>, then the -end of <tt>thread1()</tt>'s grace period must precede the -beginning of <tt>thread2()</tt>'s grace period. -This means that the two RCU read-side critical sections cannot overlap, -guaranteeing that <tt>(r3 == 1)</tt>. -As a result, the outcome: - -<blockquote> -<pre> -(r1 == 1 && r2 == 1 && r3 == 0 && r4 == 1) -</pre> -</blockquote> - -cannot happen. - -<p> -This non-requirement was also non-premeditated, but became apparent -when studying RCU's interaction with memory ordering. - -<h3><a name="Read-Side Critical Sections Don't Partition Grace Periods"> -Read-Side Critical Sections Don't Partition Grace Periods</a></h3> - -<p> -It is also tempting to assume that if an RCU read-side critical section -happens between a pair of grace periods, then those grace periods cannot -overlap. 
-However, this temptation leads nowhere good, as can be illustrated by -the following, with all variables initially zero: - -<blockquote> -<pre> - 1 void thread0(void) - 2 { - 3 rcu_read_lock(); - 4 WRITE_ONCE(a, 1); - 5 WRITE_ONCE(b, 1); - 6 rcu_read_unlock(); - 7 } - 8 - 9 void thread1(void) -10 { -11 r1 = READ_ONCE(a); -12 synchronize_rcu(); -13 WRITE_ONCE(c, 1); -14 } -15 -16 void thread2(void) -17 { -18 rcu_read_lock(); -19 WRITE_ONCE(d, 1); -20 r2 = READ_ONCE(c); -21 rcu_read_unlock(); -22 } -23 -24 void thread3(void) -25 { -26 r3 = READ_ONCE(d); -27 synchronize_rcu(); -28 WRITE_ONCE(e, 1); -29 } -30 -31 void thread4(void) -32 { -33 rcu_read_lock(); -34 r4 = READ_ONCE(b); -35 r5 = READ_ONCE(e); -36 rcu_read_unlock(); -37 } -</pre> -</blockquote> - -<p> -In this case, the outcome: - -<blockquote> -<pre> -(r1 == 1 && r2 == 1 && r3 == 1 && r4 == 0 && r5 == 1) -</pre> -</blockquote> - -is entirely possible, as illustrated below: - -<p><img src="ReadersPartitionGP1.svg" alt="ReadersPartitionGP1.svg" width="100%"></p> - -<p> -Again, an RCU read-side critical section can overlap almost all of a -given grace period, just so long as it does not overlap the entire -grace period. -As a result, an RCU read-side critical section cannot partition a pair -of RCU grace periods. - -<p>@@QQ@@ -How long a sequence of grace periods, each separated by an RCU read-side -critical section, would be required to partition the RCU read-side -critical sections at the beginning and end of the chain? -<p>@@QQA@@ -In theory, an infinite number. -In practice, an unknown number that is sensitive to both implementation -details and timing considerations. -Therefore, even in practice, RCU users must abide by the theoretical rather -than the practical answer. 
-<p>@@QQE@@ - -<h3><a name="Disabling Preemption Does Not Block Grace Periods"> -Disabling Preemption Does Not Block Grace Periods</a></h3> - -<p> -There was a time when disabling preemption on any given CPU would block -subsequent grace periods. -However, this was an accident of implementation and is not a requirement. -And in the current Linux-kernel implementation, disabling preemption -on a given CPU in fact does not block grace periods, as Oleg Nesterov -<a href="https://lkml.kernel.org/g/20150614193825.GA19582@redhat.com">demonstrated</a>. - -<p> -If you need a preempt-disable region to block grace periods, you need to add -<tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>, for example -as follows: - -<blockquote> -<pre> - 1 preempt_disable(); - 2 rcu_read_lock(); - 3 do_something(); - 4 rcu_read_unlock(); - 5 preempt_enable(); - 6 - 7 /* Spinlocks implicitly disable preemption. */ - 8 spin_lock(&mylock); - 9 rcu_read_lock(); -10 do_something(); -11 rcu_read_unlock(); -12 spin_unlock(&mylock); -</pre> -</blockquote> - -<p> -In theory, you could enter the RCU read-side critical section first, -but it is more efficient to keep the entire RCU read-side critical -section contained in the preempt-disable region as shown above. -Of course, RCU read-side critical sections that extend outside of -preempt-disable regions will work correctly, but such critical sections -can be preempted, which forces <tt>rcu_read_unlock()</tt> to do -more work. -And no, this is <i>not</i> an invitation to enclose all of your RCU -read-side critical sections within preempt-disable regions, because -doing so would degrade real-time response. - -<p> -This non-requirement appeared with preemptible RCU. -If you need a grace period that waits on non-preemptible code regions, use -<a href="#Sched Flavor">RCU-sched</a>. 
- -<h2><a name="Parallelism Facts of Life">Parallelism Facts of Life</a></h2> - -<p> -These parallelism facts of life are by no means specific to RCU, but -the RCU implementation must abide by them. -They therefore bear repeating: - -<ol> -<li> Any CPU or task may be delayed at any time, - and any attempts to avoid these delays by disabling - preemption, interrupts, or whatever are completely futile. - This is most obvious in preemptible user-level - environments and in virtualized environments (where - a given guest OS's VCPUs can be preempted at any time by - the underlying hypervisor), but can also happen in bare-metal - environments due to ECC errors, NMIs, and other hardware - events. - Although a delay of more than about 20 seconds can result - in splats, the RCU implementation is obligated to use - algorithms that can tolerate extremely long delays, but where - “extremely long” is not long enough to allow - wrap-around when incrementing a 64-bit counter. -<li> Both the compiler and the CPU can reorder memory accesses. - Where it matters, RCU must use compiler directives and - memory-barrier instructions to preserve ordering. -<li> Conflicting writes to memory locations in any given cache line - will result in expensive cache misses. - Greater numbers of concurrent writes and more-frequent - concurrent writes will result in more dramatic slowdowns. - RCU is therefore obligated to use algorithms that have - sufficient locality to avoid significant performance and - scalability problems. -<li> As a rough rule of thumb, only one CPU's worth of processing - may be carried out under the protection of any given exclusive - lock. - RCU must therefore use scalable locking designs. -<li> Counters are finite, especially on 32-bit systems. - RCU's use of counters must therefore tolerate counter wrap, - or be designed such that counter wrap would take way more - time than a single system is likely to run. 
- An uptime of ten years is quite possible, a runtime - of a century much less so. - As an example of the latter, RCU's dyntick-idle nesting counter - allows 54 bits for interrupt nesting level (this counter - is 64 bits even on a 32-bit system). - Overflowing this counter requires 2<sup>54</sup> - half-interrupts on a given CPU without that CPU ever going idle. - If a half-interrupt happened every microsecond, it would take - 570 years of runtime to overflow this counter, which is currently - believed to be an acceptably long time. -<li> Linux systems can have thousands of CPUs running a single - Linux kernel in a single shared-memory environment. - RCU must therefore pay close attention to high-end scalability. -</ol> - -<p> -This last parallelism fact of life means that RCU must pay special -attention to the preceding facts of life. -The idea that Linux might scale to systems with thousands of CPUs would -have been met with some skepticism in the 1990s, but these requirements -would otherwise have been unsurprising, even in the early 1990s. - -<h2><a name="Quality-of-Implementation Requirements">Quality-of-Implementation Requirements</a></h2> - -<p> -These sections list quality-of-implementation requirements. -Although an RCU implementation that ignores these requirements could -still be used, it would likely be subject to limitations that would -make it inappropriate for industrial-strength production use. -Classes of quality-of-implementation requirements are as follows: - -<ol> -<li> <a href="#Specialization">Specialization</a> -<li> <a href="#Performance and Scalability">Performance and Scalability</a> -<li> <a href="#Composability">Composability</a> -<li> <a href="#Corner Cases">Corner Cases</a> -</ol> - -<p> -Each of these classes is covered in the following sections. - -<h3><a name="Specialization">Specialization</a></h3> - -<p> -RCU is and always has been intended primarily for read-mostly situations, as -illustrated by the following figure. 
-This means that RCU's read-side primitives are optimized, often at the -expense of its update-side primitives. - -<p><img src="RCUApplicability.svg" alt="RCUApplicability.svg" width="70%"></p> - -<p> -This focus on read-mostly situations means that RCU must interoperate -with other synchronization primitives. -For example, the <tt>add_gp()</tt> and <tt>remove_gp_synchronous()</tt> -examples discussed earlier use RCU to protect readers and locking to -coordinate updaters. -However, the need extends much farther, requiring that a variety of -synchronization primitives be legal within RCU read-side critical sections, -including spinlocks, sequence locks, atomic operations, reference -counters, and memory barriers. - -<p>@@QQ@@ -What about sleeping locks? -<p>@@QQA@@ -These are forbidden within Linux-kernel RCU read-side critical sections -because it is not legal to place a quiescent state (in this case, -voluntary context switch) within an RCU read-side critical section. -However, sleeping locks may be used within userspace RCU read-side critical -sections, and also within Linux-kernel sleepable RCU -<a href="#Sleepable RCU">(SRCU)</a> -read-side critical sections. -In addition, the -rt patchset turns spinlocks into sleeping locks so -that the corresponding critical sections can be preempted, which -also means that these sleeplockified spinlocks (but not other sleeping locks!) -may be acquired within -rt-Linux-kernel RCU read-side critical sections. - -<p> -Note that it <i>is</i> legal for a normal RCU read-side critical section -to conditionally acquire a sleeping lock (as in <tt>mutex_trylock()</tt>), -but only as long as it does not loop indefinitely attempting to -conditionally acquire that sleeping lock. -The key point is that things like <tt>mutex_trylock()</tt> -either return with the mutex held, or return an error indication if -the mutex was not immediately available. -Either way, <tt>mutex_trylock()</tt> returns immediately without sleeping. 
-<p>@@QQE@@ - -<p> -It often comes as a surprise that many algorithms do not require a -consistent view of data, but many can function in that mode, -with network routing being the poster child. -Internet routing algorithms take significant time to propagate -updates, so that by the time an update arrives at a given system, -that system has been sending network traffic the wrong way for -a considerable length of time. -Having a few threads continue to send traffic the wrong way for a -few more milliseconds is clearly not a problem: In the worst case, -TCP retransmissions will eventually get the data where it needs to go. -In general, when tracking the state of the universe outside of the -computer, some level of inconsistency must be tolerated due to -speed-of-light delays if nothing else. - -<p> -Furthermore, uncertainty about external state is inherent in many cases. -For example, a pair of veterinarians might use heartbeat to determine -whether or not a given cat was alive. -But how long should they wait after the last heartbeat to decide that -the cat is in fact dead? -Waiting less than 400 milliseconds makes no sense because this would -mean that a relaxed cat would be considered to cycle between death -and life more than 100 times per minute. -Moreover, just as with human beings, a cat's heart might stop for -some period of time, so the exact wait period is a judgment call. -One of our pair of veterinarians might wait 30 seconds before pronouncing -the cat dead, while the other might insist on waiting a full minute. -The two veterinarians would then disagree on the state of the cat during -the final 30 seconds of the minute following the last heartbeat, as -fancifully illustrated below: - -<p><img src="2013-08-is-it-dead.png" alt="2013-08-is-it-dead.png" width="431"></p> - -<p> -Interestingly enough, this same situation applies to hardware. -When push comes to shove, how do we tell whether or not some -external server has failed? 
-We send messages to it periodically, and declare it failed if we -don't receive a response within a given period of time. -Policy decisions can usually tolerate short -periods of inconsistency. -The policy was decided some time ago, and is only now being put into -effect, so a few milliseconds of delay is normally inconsequential. - -<p> -However, there are algorithms that absolutely must see consistent data. -For example, the translation between a user-level SystemV semaphore -ID to the corresponding in-kernel data structure is protected by RCU, -but it is absolutely forbidden to update a semaphore that has just been -removed. -In the Linux kernel, this need for consistency is accommodated by acquiring -spinlocks located in the in-kernel data structure from within -the RCU read-side critical section, and this is indicated by the -green box in the figure above. -Many other techniques may be used, and are in fact used within the -Linux kernel. - -<p> -In short, RCU is not required to maintain consistency, and other -mechanisms may be used in concert with RCU when consistency is required. -RCU's specialization allows it to do its job extremely well, and its -ability to interoperate with other synchronization mechanisms allows -the right mix of synchronization tools to be used for a given job. - -<h3><a name="Performance and Scalability">Performance and Scalability</a></h3> - -<p> -Energy efficiency is a critical component of performance today, -and Linux-kernel RCU implementations must therefore avoid unnecessarily -awakening idle CPUs. -I cannot claim that this requirement was premeditated. -In fact, I learned of it during a telephone conversation in which I -was given “frank and open” feedback on the importance -of energy efficiency in battery-powered systems and on specific -energy-efficiency shortcomings of the Linux-kernel RCU implementation. 
-In my experience, the battery-powered embedded community will consider -any unnecessary wakeups to be extremely unfriendly acts. -So much so that mere Linux-kernel-mailing-list posts are -insufficient to vent their ire. - -<p> -Memory consumption is not particularly important in most -situations, and has become decreasingly -so as memory sizes have expanded and memory -costs have plummeted. -However, as I learned from Matt Mackall's -<a href="http://elinux.org/Linux_Tiny-FAQ">bloatwatch</a> -efforts, memory footprint is critically important on single-CPU systems with -non-preemptible (<tt>CONFIG_PREEMPT=n</tt>) kernels, and thus -<a href="https://lkml.kernel.org/g/20090113221724.GA15307@linux.vnet.ibm.com">tiny RCU</a> -was born. -Josh Triplett has since taken over the small-memory banner with his -<a href="https://tiny.wiki.kernel.org/">Linux kernel tinification</a> -project, which resulted in -<a href="#Sleepable RCU">SRCU</a> -becoming optional for those kernels not needing it. - -<p> -The remaining performance requirements are, for the most part, -unsurprising. -For example, in keeping with RCU's read-side specialization, -<tt>rcu_dereference()</tt> should have negligible overhead (for -example, suppression of a few minor compiler optimizations). -Similarly, in non-preemptible environments, <tt>rcu_read_lock()</tt> and -<tt>rcu_read_unlock()</tt> should have exactly zero overhead. - -<p> -In preemptible environments, in the case where the RCU read-side -critical section was not preempted (as will be the case for the -highest-priority real-time process), <tt>rcu_read_lock()</tt> and -<tt>rcu_read_unlock()</tt> should have minimal overhead. -In particular, they should not contain atomic read-modify-write -operations, memory-barrier instructions, preemption disabling, -interrupt disabling, or backwards branches. 
-However, in the case where the RCU read-side critical section was preempted, -<tt>rcu_read_unlock()</tt> may acquire spinlocks and disable interrupts. -This is why it is better to nest an RCU read-side critical section -within a preempt-disable region than vice versa, at least in cases -where that critical section is short enough to avoid unduly degrading -real-time latencies. - -<p> -The <tt>synchronize_rcu()</tt> grace-period-wait primitive is -optimized for throughput. -It may therefore incur several milliseconds of latency in addition to -the duration of the longest RCU read-side critical section. -On the other hand, multiple concurrent invocations of -<tt>synchronize_rcu()</tt> are required to use batching optimizations -so that they can be satisfied by a single underlying grace-period-wait -operation. -For example, in the Linux kernel, it is not unusual for a single -grace-period-wait operation to serve more than -<a href="https://www.usenix.org/conference/2004-usenix-annual-technical-conference/making-rcu-safe-deep-sub-millisecond-response">1,000 separate invocations</a> -of <tt>synchronize_rcu()</tt>, thus amortizing the per-invocation -overhead down to nearly zero. -However, the grace-period optimization is also required to avoid -measurable degradation of real-time scheduling and interrupt latencies. - -<p> -In some cases, the multi-millisecond <tt>synchronize_rcu()</tt> -latencies are unacceptable. -In these cases, <tt>synchronize_rcu_expedited()</tt> may be used -instead, reducing the grace-period latency down to a few tens of -microseconds on small systems, at least in cases where the RCU read-side -critical sections are short. -There are currently no special latency requirements for -<tt>synchronize_rcu_expedited()</tt> on large systems, but, -consistent with the empirical nature of the RCU specification, -that is subject to change. 
-However, there most definitely are scalability requirements: -A storm of <tt>synchronize_rcu_expedited()</tt> invocations on 4096 -CPUs should at least make reasonable forward progress. -In return for its shorter latencies, <tt>synchronize_rcu_expedited()</tt> -is permitted to impose modest degradation of real-time latency -on non-idle online CPUs. -That said, it will likely be necessary to take further steps to reduce this -degradation, hopefully to roughly that of a scheduling-clock interrupt. - -<p> -There are a number of situations where even -<tt>synchronize_rcu_expedited()</tt>'s reduced grace-period -latency is unacceptable. -In these situations, the asynchronous <tt>call_rcu()</tt> can be -used in place of <tt>synchronize_rcu()</tt> as follows: - -<blockquote> -<pre> - 1 struct foo { - 2 int a; - 3 int b; - 4 struct rcu_head rh; - 5 }; - 6 - 7 static void remove_gp_cb(struct rcu_head *rhp) - 8 { - 9 struct foo *p = container_of(rhp, struct foo, rh); -10 -11 kfree(p); -12 } -13 -14 bool remove_gp_asynchronous(void) -15 { -16 struct foo *p; -17 -18 spin_lock(&gp_lock); -19 p = rcu_access_pointer(gp); -20 if (!p) { -21 spin_unlock(&gp_lock); -22 return false; -23 } -24 rcu_assign_pointer(gp, NULL); -25 call_rcu(&p->rh, remove_gp_cb); -26 spin_unlock(&gp_lock); -27 return true; -28 } -</pre> -</blockquote> - -<p> -A definition of <tt>struct foo</tt> is finally needed, and appears -on lines 1-5. -The function <tt>remove_gp_cb()</tt> is passed to <tt>call_rcu()</tt> -on line 25, and will be invoked after the end of a subsequent -grace period. -This gets the same effect as <tt>remove_gp_synchronous()</tt>, -but without forcing the updater to wait for a grace period to elapse. -The <tt>call_rcu()</tt> function may be used in a number of -situations where neither <tt>synchronize_rcu()</tt> nor -<tt>synchronize_rcu_expedited()</tt> would be legal, -including within preempt-disable code, <tt>local_bh_disable()</tt> code, -interrupt-disable code, and interrupt handlers.
-However, even <tt>call_rcu()</tt> is illegal within NMI handlers. -The callback function (<tt>remove_gp_cb()</tt> in this case) will be -executed within softirq (software interrupt) environment within the -Linux kernel, -either within a real softirq handler or under the protection -of <tt>local_bh_disable()</tt>. -In both the Linux kernel and in userspace, it is bad practice to -write an RCU callback function that takes too long. -Long-running operations should be relegated to separate threads or -(in the Linux kernel) workqueues. - -<p>@@QQ@@ -Why does line 19 use <tt>rcu_access_pointer()</tt>? -After all, <tt>call_rcu()</tt> on line 25 stores into the -structure, which would interact badly with concurrent insertions. -Doesn't this mean that <tt>rcu_dereference()</tt> is required? -<p>@@QQA@@ -Presumably the <tt>->gp_lock</tt> acquired on line 18 excludes -any changes, including any insertions that <tt>rcu_dereference()</tt> -would protect against. -Therefore, any insertions will be delayed until after <tt>->gp_lock</tt> -is released on line 26, which in turn means that -<tt>rcu_access_pointer()</tt> suffices. -<p>@@QQE@@ - -<p> -However, all that <tt>remove_gp_cb()</tt> is doing is -invoking <tt>kfree()</tt> on the data element. -This is a common idiom, and is supported by <tt>kfree_rcu()</tt>, -which allows “fire and forget” operation as shown below: - -<blockquote> -<pre> - 1 struct foo { - 2 int a; - 3 int b; - 4 struct rcu_head rh; - 5 }; - 6 - 7 bool remove_gp_faf(void) - 8 { - 9 struct foo *p; -10 -11 spin_lock(&gp_lock); -12 p = rcu_dereference(gp); -13 if (!p) { -14 spin_unlock(&gp_lock); -15 return false; -16 } -17 rcu_assign_pointer(gp, NULL); -18 kfree_rcu(p, rh); -19 spin_unlock(&gp_lock); -20 return true; -21 } -</pre> -</blockquote> - -<p> -Note that <tt>remove_gp_faf()</tt> simply invokes -<tt>kfree_rcu()</tt> and proceeds, without any need to pay any -further attention to the subsequent grace period and <tt>kfree()</tt>.
-It is permissible to invoke <tt>kfree_rcu()</tt> from the same -environments as for <tt>call_rcu()</tt>. -Interestingly enough, DYNIX/ptx had the equivalents of -<tt>call_rcu()</tt> and <tt>kfree_rcu()</tt>, but not -<tt>synchronize_rcu()</tt>. -This was due to the fact that RCU was not heavily used within DYNIX/ptx, -so the very few places that needed something like -<tt>synchronize_rcu()</tt> simply open-coded it. - -<p>@@QQ@@ -Earlier it was claimed that <tt>call_rcu()</tt> and -<tt>kfree_rcu()</tt> allowed updaters to avoid being blocked -by readers. -But how can that be correct, given that the invocation of the callback -and the freeing of the memory (respectively) must still wait for -a grace period to elapse? -<p>@@QQA@@ -We could define things this way, but keep in mind that this sort of -definition would say that updates in garbage-collected languages -cannot complete until the next time the garbage collector runs, -which does not seem at all reasonable. -The key point is that in most cases, an updater using either -<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> can proceed to the -next update as soon as it has invoked <tt>call_rcu()</tt> or -<tt>kfree_rcu()</tt>, without having to wait for a subsequent -grace period. -<p>@@QQE@@ - -<p> -But what if the updater must wait for the completion of code to be -executed after the end of the grace period, but has other tasks -that can be carried out in the meantime? 
-The polling-style <tt>get_state_synchronize_rcu()</tt> and -<tt>cond_synchronize_rcu()</tt> functions may be used for this -purpose, as shown below: - -<blockquote> -<pre> - 1 bool remove_gp_poll(void) - 2 { - 3 struct foo *p; - 4 unsigned long s; - 5 - 6 spin_lock(&gp_lock); - 7 p = rcu_access_pointer(gp); - 8 if (!p) { - 9 spin_unlock(&gp_lock); -10 return false; -11 } -12 rcu_assign_pointer(gp, NULL); -13 spin_unlock(&gp_lock); -14 s = get_state_synchronize_rcu(); -15 do_something_while_waiting(); -16 cond_synchronize_rcu(s); -17 kfree(p); -18 return true; -19 } -</pre> -</blockquote> - -<p> -On line 14, <tt>get_state_synchronize_rcu()</tt> obtains a -“cookie” from RCU, -then line 15 carries out other tasks, -and finally, line 16 returns immediately if a grace period has -elapsed in the meantime, but otherwise waits as required. -The need for <tt>get_state_synchronize_rcu</tt> and -<tt>cond_synchronize_rcu()</tt> has appeared quite recently, -so it is too early to tell whether they will stand the test of time. - -<p> -RCU thus provides a range of tools to allow updaters to strike the -required tradeoff between latency, flexibility and CPU overhead. - -<h3><a name="Composability">Composability</a></h3> - -<p> -Composability has received much attention in recent years, perhaps in part -due to the collision of multicore hardware with object-oriented techniques -designed in single-threaded environments for single-threaded use. -And in theory, RCU read-side critical sections may be composed, and in -fact may be nested arbitrarily deeply. -In practice, as with all real-world implementations of composable -constructs, there are limitations. - -<p> -Implementations of RCU for which <tt>rcu_read_lock()</tt> -and <tt>rcu_read_unlock()</tt> generate no code, such as -Linux-kernel RCU when <tt>CONFIG_PREEMPT=n</tt>, can be -nested arbitrarily deeply. -After all, there is no overhead. 
-Except that if all these instances of <tt>rcu_read_lock()</tt> -and <tt>rcu_read_unlock()</tt> are visible to the compiler, -compilation will eventually fail due to exhausting memory, -mass storage, or user patience, whichever comes first. -If the nesting is not visible to the compiler, as is the case with -mutually recursive functions each in its own translation unit, -stack overflow will result. -If the nesting takes the form of loops, either the control variable -will overflow or (in the Linux kernel) you will get an RCU CPU stall warning. -Nevertheless, this class of RCU implementations is one -of the most composable constructs in existence. - -<p> -RCU implementations that explicitly track nesting depth -are limited by the nesting-depth counter. -For example, the Linux kernel's preemptible RCU limits nesting to -<tt>INT_MAX</tt>. -This should suffice for almost all practical purposes. -That said, a consecutive pair of RCU read-side critical sections -between which there is an operation that waits for a grace period -cannot be enclosed in another RCU read-side critical section. -This is because it is not legal to wait for a grace period within -an RCU read-side critical section: To do so would result either -in deadlock or -in RCU implicitly splitting the enclosing RCU read-side critical -section, neither of which is conducive to a long-lived and prosperous -kernel. - -<p> -It is worth noting that RCU is not alone in limiting composability. -For example, many transactional-memory implementations prohibit -composing a pair of transactions separated by an irrevocable -operation (for example, a network receive operation). -For another example, lock-based critical sections can be composed -surprisingly freely, but only if deadlock is avoided. - -<p> -In short, although RCU read-side critical sections are highly composable, -care is required in some situations, just as is the case for any other -composable synchronization mechanism. 
- -<h3><a name="Corner Cases">Corner Cases</a></h3> - -<p> -A given RCU workload might have an endless and intense stream of -RCU read-side critical sections, perhaps even so intense that there -was never a point in time during which there was not at least one -RCU read-side critical section in flight. -RCU cannot allow this situation to block grace periods: As long as -all the RCU read-side critical sections are finite, grace periods -must also be finite. - -<p> -That said, preemptible RCU implementations could potentially result -in RCU read-side critical sections being preempted for long durations, -which has the effect of creating a long-duration RCU read-side -critical section. -This situation can arise only in heavily loaded systems, but systems using -real-time priorities are of course more vulnerable. -Therefore, RCU priority boosting is provided to help deal with this -case. -That said, the exact requirements on RCU priority boosting will likely -evolve as more experience accumulates. - -<p> -Other workloads might have very high update rates. -Although one can argue that such workloads should instead use -something other than RCU, the fact remains that RCU must -handle such workloads gracefully. -This requirement is another factor driving batching of grace periods, -but it is also the driving force behind the checks for large numbers -of queued RCU callbacks in the <tt>call_rcu()</tt> code path. -Finally, high update rates should not delay RCU read-side critical -sections, although some read-side delays can occur when using -<tt>synchronize_rcu_expedited()</tt>, courtesy of this function's use -of <tt>try_stop_cpus()</tt>. -(In the future, <tt>synchronize_rcu_expedited()</tt> will be -converted to use lighter-weight inter-processor interrupts (IPIs), -but this will still disturb readers, though to a much smaller degree.) 
- -<p> -Although all three of these corner cases were understood in the early -1990s, a simple user-level test consisting of <tt>close(open(path))</tt> -in a tight loop -in the early 2000s suddenly provided a much deeper appreciation of the -high-update-rate corner case. -This test also motivated addition of some RCU code to react to high update -rates, for example, if a given CPU finds itself with more than 10,000 -RCU callbacks queued, it will cause RCU to take evasive action by -more aggressively starting grace periods and more aggressively forcing -completion of grace-period processing. -This evasive action causes the grace period to complete more quickly, -but at the cost of restricting RCU's batching optimizations, thus -increasing the CPU overhead incurred by that grace period. - -<h2><a name="Software-Engineering Requirements"> -Software-Engineering Requirements</a></h2> - -<p> -Between Murphy's Law and “To err is human”, it is necessary to -guard against mishaps and misuse: - -<ol> -<li> It is all too easy to forget to use <tt>rcu_read_lock()</tt> - everywhere that it is needed, so kernels built with - <tt>CONFIG_PROVE_RCU=y</tt> will splat if - <tt>rcu_dereference()</tt> is used outside of an - RCU read-side critical section. - Update-side code can use <tt>rcu_dereference_protected()</tt>, - which takes a - <a href="https://lwn.net/Articles/371986/">lockdep expression</a> - to indicate what is providing the protection. - If the indicated protection is not provided, a lockdep splat - is emitted. - - <p> - Code shared between readers and updaters can use - <tt>rcu_dereference_check()</tt>, which also takes a - lockdep expression, and emits a lockdep splat if neither - <tt>rcu_read_lock()</tt> nor the indicated protection - is in place. - In addition, <tt>rcu_dereference_raw()</tt> is used in those - (hopefully rare) cases where the required protection cannot - be easily described.
- Finally, <tt>rcu_read_lock_held()</tt> is provided to - allow a function to verify that it has been invoked within - an RCU read-side critical section. - I was made aware of this set of requirements shortly after Thomas - Gleixner audited a number of RCU uses. -<li> A given function might wish to check for RCU-related preconditions - upon entry, before using any other RCU API. - The <tt>rcu_lockdep_assert()</tt> does this job, - asserting the expression in kernels having lockdep enabled - and doing nothing otherwise. -<li> It is also easy to forget to use <tt>rcu_assign_pointer()</tt> - and <tt>rcu_dereference()</tt>, perhaps (incorrectly) - substituting a simple assignment. - To catch this sort of error, a given RCU-protected pointer may be - tagged with <tt>__rcu</tt>, after which running sparse - with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt> will complain - about simple-assignment accesses to that pointer. - Arnd Bergmann made me aware of this requirement, and also - supplied the needed - <a href="https://lwn.net/Articles/376011/">patch series</a>. -<li> Kernels built with <tt>CONFIG_DEBUG_OBJECTS_RCU_HEAD=y</tt> - will splat if a data element is passed to <tt>call_rcu()</tt> - twice in a row, without a grace period in between. - (This error is similar to a double free.) - The corresponding <tt>rcu_head</tt> structures that are - dynamically allocated are automatically tracked, but - <tt>rcu_head</tt> structures allocated on the stack - must be initialized with <tt>init_rcu_head_on_stack()</tt> - and cleaned up with <tt>destroy_rcu_head_on_stack()</tt>. - Similarly, statically allocated non-stack <tt>rcu_head</tt> - structures must be initialized with <tt>init_rcu_head()</tt> - and cleaned up with <tt>destroy_rcu_head()</tt>. - Mathieu Desnoyers made me aware of this requirement, and also - supplied the needed - <a href="https://lkml.kernel.org/g/20100319013024.GA28456@Krystal">patch</a>. 
-<li> An infinite loop in an RCU read-side critical section will - eventually trigger an RCU CPU stall warning splat, with - the duration of “eventually” being controlled by the - <tt>RCU_CPU_STALL_TIMEOUT</tt> <tt>Kconfig</tt> option, or, - alternatively, by the - <tt>rcupdate.rcu_cpu_stall_timeout</tt> boot/sysfs - parameter. - However, RCU is not obligated to produce this splat - unless there is a grace period waiting on that particular - RCU read-side critical section. - <p> - Some extreme workloads might intentionally delay - RCU grace periods, and systems running those workloads can - be booted with <tt>rcupdate.rcu_cpu_stall_suppress</tt> - to suppress the splats. - This kernel parameter may also be set via <tt>sysfs</tt>. - Furthermore, RCU CPU stall warnings are counter-productive - during sysrq dumps and during panics. - RCU therefore supplies the <tt>rcu_sysrq_start()</tt> and - <tt>rcu_sysrq_end()</tt> API members to be called before - and after long sysrq dumps. - RCU also supplies the <tt>rcu_panic()</tt> notifier that is - automatically invoked at the beginning of a panic to suppress - further RCU CPU stall warnings. - - <p> - This requirement made itself known in the early 1990s, pretty - much the first time that it was necessary to debug a CPU stall. - That said, the initial implementation in DYNIX/ptx was quite - generic in comparison with that of Linux. -<li> Although it would be very good to detect pointers leaking out - of RCU read-side critical sections, there is currently no - good way of doing this. - One complication is the need to distinguish between pointers - leaking and pointers that have been handed off from RCU to - some other synchronization mechanism, for example, reference - counting. -<li> In kernels built with <tt>CONFIG_RCU_TRACE=y</tt>, RCU-related - information is provided via both debugfs and event tracing. 
-<li> Open-coded use of <tt>rcu_assign_pointer()</tt> and - <tt>rcu_dereference()</tt> to create typical linked - data structures can be surprisingly error-prone. - Therefore, RCU-protected - <a href="https://lwn.net/Articles/609973/#RCU List APIs">linked lists</a> - and, more recently, RCU-protected - <a href="https://lwn.net/Articles/612100/">hash tables</a> - are available. - Many other special-purpose RCU-protected data structures are - available in the Linux kernel and the userspace RCU library. -<li> Some linked structures are created at compile time, but still - require <tt>__rcu</tt> checking. - The <tt>RCU_POINTER_INITIALIZER()</tt> macro serves this - purpose. -<li> It is not necessary to use <tt>rcu_assign_pointer()</tt> - when creating linked structures that are to be published via - a single external pointer. - The <tt>RCU_INIT_POINTER()</tt> macro is provided for - this task and also for assigning <tt>NULL</tt> pointers - at runtime. -</ol> - -<p> -This is not a hard-and-fast list: RCU's diagnostic capabilities will -continue to be guided by the number and type of usage bugs found -in real-world RCU usage. - -<h2><a name="Linux Kernel Complications">Linux Kernel Complications</a></h2> - -<p> -The Linux kernel provides an interesting environment for all kinds of -software, including RCU. -Some of the relevant points of interest are as follows: - -<ol> -<li> <a href="#Configuration">Configuration</a>. -<li> <a href="#Firmware Interface">Firmware Interface</a>. -<li> <a href="#Early Boot">Early Boot</a>. -<li> <a href="#Interrupts and NMIs"> - Interrupts and non-maskable interrupts (NMIs)</a>. -<li> <a href="#Loadable Modules">Loadable Modules</a>. -<li> <a href="#Hotplug CPU">Hotplug CPU</a>. -<li> <a href="#Scheduler and RCU">Scheduler and RCU</a>. -<li> <a href="#Tracing and RCU">Tracing and RCU</a>. -<li> <a href="#Energy Efficiency">Energy Efficiency</a>. -<li> <a href="#Memory Efficiency">Memory Efficiency</a>.
-<li> <a href="#Performance, Scalability, Response Time, and Reliability"> - Performance, Scalability, Response Time, and Reliability</a>. -</ol> - -<p> -This list is probably incomplete, but it does give a feel for the -most notable Linux-kernel complications. -Each of the following sections covers one of the above topics. - -<h3><a name="Configuration">Configuration</a></h3> - -<p> -RCU's goal is automatic configuration, so that almost nobody -needs to worry about RCU's <tt>Kconfig</tt> options. -And for almost all users, RCU does in fact work well -“out of the box.” - -<p> -However, there are specialized use cases that are handled by -kernel boot parameters and <tt>Kconfig</tt> options. -Unfortunately, the <tt>Kconfig</tt> system will explicitly ask users -about new <tt>Kconfig</tt> options, which requires almost all of them -be hidden behind a <tt>CONFIG_RCU_EXPERT</tt> <tt>Kconfig</tt> option. - -<p> -This all should be quite obvious, but the fact remains that -Linus Torvalds recently had to -<a href="https://lkml.kernel.org/g/CA+55aFy4wcCwaL4okTs8wXhGZ5h-ibecy_Meg9C4MNQrUnwMcg@mail.gmail.com">remind</a> -me of this requirement. - -<h3><a name="Firmware Interface">Firmware Interface</a></h3> - -<p> -In many cases, the kernel obtains information about the system from the -firmware, and sometimes things are lost in translation. -Or the translation is accurate, but the original message is bogus. - -<p> -For example, some systems' firmware overreports the number of CPUs, -sometimes by a large factor. -If RCU naively believed the firmware, as it used to do, -it would create too many per-CPU kthreads. -Although the resulting system will still run correctly, the extra -kthreads needlessly consume memory and can cause confusion -when they show up in <tt>ps</tt> listings. - -<p> -RCU must therefore wait for a given CPU to actually come online before -it can allow itself to believe that the CPU actually exists.
-The resulting “ghost CPUs” (which are never going to -come online) cause a number of -<a href="https://paulmck.livejournal.com/37494.html">interesting complications</a>. - -<h3><a name="Early Boot">Early Boot</a></h3> - -<p> -The Linux kernel's boot sequence is an interesting process, -and RCU is used early, even before <tt>rcu_init()</tt> -is invoked. -In fact, a number of RCU's primitives can be used as soon as the -initial task's <tt>task_struct</tt> is available and the -boot CPU's per-CPU variables are set up. -The read-side primitives (<tt>rcu_read_lock()</tt>, -<tt>rcu_read_unlock()</tt>, <tt>rcu_dereference()</tt>, -and <tt>rcu_access_pointer()</tt>) will operate normally very early on, -as will <tt>rcu_assign_pointer()</tt>. - -<p> -Although <tt>call_rcu()</tt> may be invoked at any -time during boot, callbacks are not guaranteed to be invoked until after -the scheduler is fully up and running. -This delay in callback invocation is due to the fact that RCU does not -invoke callbacks until it is fully initialized, and this full initialization -cannot occur until after the scheduler has initialized itself to the -point where RCU can spawn and run its kthreads. -In theory, it would be possible to invoke callbacks earlier, -however, this is not a panacea because there would be severe restrictions -on what operations those callbacks could invoke. - -<p> -Perhaps surprisingly, <tt>synchronize_rcu()</tt>, -<a href="#Bottom-Half Flavor"><tt>synchronize_rcu_bh()</tt></a> -(<a href="#Bottom-Half Flavor">discussed below</a>), -and -<a href="#Sched Flavor"><tt>synchronize_sched()</tt></a> -will all operate normally -during very early boot, the reason being that there is only one CPU -and preemption is disabled. -This means that the call to <tt>synchronize_rcu()</tt> (or friends) -itself is a quiescent -state and thus a grace period, so the early-boot implementation can -be a no-op.
- -<p> -Both <tt>synchronize_rcu_bh()</tt> and <tt>synchronize_sched()</tt> -continue to operate normally through the remainder of boot, courtesy -of the fact that preemption is disabled across their RCU read-side -critical sections and also courtesy of the fact that there is still -only one CPU. -However, once the scheduler starts initializing, preemption is enabled. -There is still only a single CPU, but the fact that preemption is enabled -means that the no-op implementation of <tt>synchronize_rcu()</tt> no -longer works in <tt>CONFIG_PREEMPT=y</tt> kernels. -Therefore, as soon as the scheduler starts initializing, the early-boot -fastpath is disabled. -This means that <tt>synchronize_rcu()</tt> switches to its runtime -mode of operation where it posts callbacks, which in turn means that -any call to <tt>synchronize_rcu()</tt> will block until the corresponding -callback is invoked. -Unfortunately, the callback cannot be invoked until RCU's runtime -grace-period machinery is up and running, which cannot happen until -the scheduler has initialized itself sufficiently to allow RCU's -kthreads to be spawned. -Therefore, invoking <tt>synchronize_rcu()</tt> during scheduler -initialization can result in deadlock. - -<p>@@QQ@@ -So what happens with <tt>synchronize_rcu()</tt> during -scheduler initialization for <tt>CONFIG_PREEMPT=n</tt> -kernels? -<p>@@QQA@@ -In <tt>CONFIG_PREEMPT=n</tt> kernels, <tt>synchronize_rcu()</tt> -maps directly to <tt>synchronize_sched()</tt>. -Therefore, <tt>synchronize_rcu()</tt> works normally throughout -boot in <tt>CONFIG_PREEMPT=n</tt> kernels. -However, your code must also work in <tt>CONFIG_PREEMPT=y</tt> kernels, -so it is still necessary to avoid invoking <tt>synchronize_rcu()</tt> -during scheduler initialization. -<p>@@QQE@@ - -<p> -I learned of these boot-time requirements as a result of a series of -system hangs.
- -<h3><a name="Interrupts and NMIs">Interrupts and NMIs</a></h3> - -<p> -The Linux kernel has interrupts, and RCU read-side critical sections are -legal within interrupt handlers and within interrupt-disabled regions -of code, as are invocations of <tt>call_rcu()</tt>. - -<p> -Some Linux-kernel architectures can enter an interrupt handler from -non-idle process context, and then just never leave it, instead stealthily -transitioning back to process context. -This trick is sometimes used to invoke system calls from inside the kernel. -These “half-interrupts” mean that RCU has to be very careful -about how it counts interrupt nesting levels. -I learned of this requirement the hard way during a rewrite -of RCU's dyntick-idle code. - -<p> -The Linux kernel has non-maskable interrupts (NMIs), and -RCU read-side critical sections are legal within NMI handlers. -Thankfully, RCU update-side primitives, including -<tt>call_rcu()</tt>, are prohibited within NMI handlers. - -<p> -The name notwithstanding, some Linux-kernel architectures -can have nested NMIs, which RCU must handle correctly. -Andy Lutomirski -<a href="https://lkml.kernel.org/g/CALCETrXLq1y7e_dKFPgou-FKHB6Pu-r8+t-6Ds+8=va7anBWDA@mail.gmail.com">surprised me</a> -with this requirement; -he also kindly surprised me with -<a href="https://lkml.kernel.org/g/CALCETrXSY9JpW3uE6H8WYk81sg56qasA2aqmjMPsq5dOtzso=g@mail.gmail.com">an algorithm</a> -that meets this requirement. - -<h3><a name="Loadable Modules">Loadable Modules</a></h3> - -<p> -The Linux kernel has loadable modules, and these modules can -also be unloaded. -After a given module has been unloaded, any attempt to call -one of its functions results in a segmentation fault. -The module-unload functions must therefore cancel any -delayed calls to loadable-module functions, for example, -any outstanding <tt>mod_timer()</tt> must be dealt with -via <tt>del_timer_sync()</tt> or similar. 
- -<p> -Unfortunately, there is no way to cancel an RCU callback; -once you invoke <tt>call_rcu()</tt>, the callback function is -going to eventually be invoked, unless the system goes down first. -Because it is normally considered socially irresponsible to crash the system -in response to a module unload request, we need some other way -to deal with in-flight RCU callbacks. - -<p> -RCU therefore provides -<tt><a href="https://lwn.net/Articles/217484/">rcu_barrier()</a></tt>, -which waits until all in-flight RCU callbacks have been invoked. -If a module uses <tt>call_rcu()</tt>, its exit function should therefore -prevent any future invocation of <tt>call_rcu()</tt>, then invoke -<tt>rcu_barrier()</tt>. -In theory, the underlying module-unload code could invoke -<tt>rcu_barrier()</tt> unconditionally, but in practice this would -incur unacceptable latencies. - -<p> -Nikita Danilov noted this requirement for an analogous filesystem-unmount -situation, and Dipankar Sarma incorporated <tt>rcu_barrier()</tt> into RCU. -The need for <tt>rcu_barrier()</tt> for module unloading became -apparent later. - -<h3><a name="Hotplug CPU">Hotplug CPU</a></h3> - -<p> -The Linux kernel supports CPU hotplug, which means that CPUs -can come and go. -It is of course illegal to use any RCU API member from an offline CPU. -This requirement was present from day one in DYNIX/ptx, but -on the other hand, the Linux kernel's CPU-hotplug implementation -is “interesting.” - -<p> -The Linux-kernel CPU-hotplug implementation has notifiers that -are used to allow the various kernel subsystems (including RCU) -to respond appropriately to a given CPU-hotplug operation. -Most RCU operations may be invoked from CPU-hotplug notifiers, -including even normal synchronous grace-period operations -such as <tt>synchronize_rcu()</tt>. 
-However, expedited grace-period operations such as -<tt>synchronize_rcu_expedited()</tt> are not supported, -due to the fact that current implementations block CPU-hotplug -operations, which could result in deadlock. - -<p> -In addition, all-callback-wait operations such as -<tt>rcu_barrier()</tt> are also not supported, due to the -fact that there are phases of CPU-hotplug operations where -the outgoing CPU's callbacks will not be invoked until after -the CPU-hotplug operation ends, which could also result in deadlock. - -<h3><a name="Scheduler and RCU">Scheduler and RCU</a></h3> - -<p> -RCU depends on the scheduler, and the scheduler uses RCU to -protect some of its data structures. -This means the scheduler is forbidden from acquiring -the runqueue locks and the priority-inheritance locks -in the middle of an outermost RCU read-side critical section unless either -(1) it releases them before exiting that same -RCU read-side critical section, or -(2) interrupts are disabled across -that entire RCU read-side critical section. -This same prohibition also applies (recursively!) to any lock that is acquired -while holding any lock to which this prohibition applies. -Adhering to this rule prevents preemptible RCU from invoking -<tt>rcu_read_unlock_special()</tt> while either runqueue or -priority-inheritance locks are held, thus avoiding deadlock. - -<p> -Prior to v4.4, it was only necessary to disable preemption across -RCU read-side critical sections that acquired scheduler locks. -In v4.4, expedited grace periods started using IPIs, and these -IPIs could force a <tt>rcu_read_unlock()</tt> to take the slowpath. -Therefore, this expedited-grace-period change required disabling of -interrupts, not just preemption. - -<p> -For RCU's part, the preemptible-RCU <tt>rcu_read_unlock()</tt> -implementation must be written carefully to avoid similar deadlocks. 
-In particular, <tt>rcu_read_unlock()</tt> must tolerate an -interrupt where the interrupt handler invokes both -<tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>. -This possibility requires <tt>rcu_read_unlock()</tt> to use -negative nesting levels to avoid destructive recursion via -interrupt handler's use of RCU. - -<p> -This pair of mutual scheduler-RCU requirements came as a -<a href="https://lwn.net/Articles/453002/">complete surprise</a>. - -<p> -As noted above, RCU makes use of kthreads, and it is necessary to -avoid excessive CPU-time accumulation by these kthreads. -This requirement was no surprise, but RCU's violation of it -when running context-switch-heavy workloads when built with -<tt>CONFIG_NO_HZ_FULL=y</tt> -<a href="http://www.rdrop.com/users/paulmck/scalability/paper/BareMetal.2015.01.15b.pdf">did come as a surprise [PDF]</a>. -RCU has made good progress towards meeting this requirement, even -for context-switch-have <tt>CONFIG_NO_HZ_FULL=y</tt> workloads, -but there is room for further improvement. - -<h3><a name="Tracing and RCU">Tracing and RCU</a></h3> - -<p> -It is possible to use tracing on RCU code, but tracing itself -uses RCU. -For this reason, <tt>rcu_dereference_raw_notrace()</tt> -is provided for use by tracing, which avoids the destructive -recursion that could otherwise ensue. -This API is also used by virtualization in some architectures, -where RCU readers execute in environments in which tracing -cannot be used. -The tracing folks both located the requirement and provided the -needed fix, so this surprise requirement was relatively painless. - -<h3><a name="Energy Efficiency">Energy Efficiency</a></h3> - -<p> -Interrupting idle CPUs is considered socially unacceptable, -especially by people with battery-powered embedded systems. -RCU therefore conserves energy by detecting which CPUs are -idle, including tracking CPUs that have been interrupted from idle. 
-This is a large part of the energy-efficiency requirement, -so I learned of this via an irate phone call. - -<p> -Because RCU avoids interrupting idle CPUs, it is illegal to -execute an RCU read-side critical section on an idle CPU. -(Kernels built with <tt>CONFIG_PROVE_RCU=y</tt> will splat -if you try it.) -The <tt>RCU_NONIDLE()</tt> macro and <tt>_rcuidle</tt> -event tracing is provided to work around this restriction. -In addition, <tt>rcu_is_watching()</tt> may be used to -test whether or not it is currently legal to run RCU read-side -critical sections on this CPU. -I learned of the need for diagnostics on the one hand -and <tt>RCU_NONIDLE()</tt> on the other while inspecting -idle-loop code. -Steven Rostedt supplied <tt>_rcuidle</tt> event tracing, -which is used quite heavily in the idle loop. - -<p> -It is similarly socially unacceptable to interrupt an -<tt>nohz_full</tt> CPU running in userspace. -RCU must therefore track <tt>nohz_full</tt> userspace -execution. -And in -<a href="https://lwn.net/Articles/558284/"><tt>CONFIG_NO_HZ_FULL_SYSIDLE=y</tt></a> -kernels, RCU must separately track idle CPUs on the one hand and -CPUs that are either idle or executing in userspace on the other. -In both cases, RCU must be able to sample state at two points in -time, and be able to determine whether or not some other CPU spent -any time idle and/or executing in userspace. - -<p> -These energy-efficiency requirements have proven quite difficult to -understand and to meet, for example, there have been more than five -clean-sheet rewrites of RCU's energy-efficiency code, the last of -which was finally able to demonstrate -<a href="http://www.rdrop.com/users/paulmck/realtime/paper/AMPenergy.2013.04.19a.pdf">real energy savings running on real hardware [PDF]</a>. 
-As noted earlier, -I learned of many of these requirements via angry phone calls: -Flaming me on the Linux-kernel mailing list was apparently not -sufficient to fully vent their ire at RCU's energy-efficiency bugs! - -<h3><a name="Memory Efficiency">Memory Efficiency</a></h3> - -<p> -Although small-memory non-realtime systems can simply use Tiny RCU, -code size is only one aspect of memory efficiency. -Another aspect is the size of the <tt>rcu_head</tt> structure -used by <tt>call_rcu()</tt> and <tt>kfree_rcu()</tt>. -Although this structure contains nothing more than a pair of pointers, -it does appear in many RCU-protected data structures, including -some that are size critical. -The <tt>page</tt> structure is a case in point, as evidenced by -the many occurrences of the <tt>union</tt> keyword within that structure. - -<p> -This need for memory efficiency is one reason that RCU uses hand-crafted -singly linked lists to track the <tt>rcu_head</tt> structures that -are waiting for a grace period to elapse. -It is also the reason why <tt>rcu_head</tt> structures do not contain -debug information, such as fields tracking the file and line of the -<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> that posted them. -Although this information might appear in debug-only kernel builds at some -point, in the meantime, the <tt>->func</tt> field will often provide -the needed debug information. - -<p> -However, in some cases, the need for memory efficiency leads to even -more extreme measures. -Returning to the <tt>page</tt> structure, the <tt>rcu_head</tt> field -shares storage with a great many other structures that are used at -various points in the corresponding page's lifetime. 
-In order to correctly resolve certain -<a href="https://lkml.kernel.org/g/1439976106-137226-1-git-send-email-kirill.shutemov@linux.intel.com">race conditions</a>, -the Linux kernel's memory-management subsystem needs a particular bit -to remain zero during all phases of grace-period processing, -and that bit happens to map to the bottom bit of the -<tt>rcu_head</tt> structure's <tt>->next</tt> field. -RCU makes this guarantee as long as <tt>call_rcu()</tt> -is used to post the callback, as opposed to <tt>kfree_rcu()</tt> -or some future “lazy” -variant of <tt>call_rcu()</tt> that might one day be created for -energy-efficiency purposes. - -<h3><a name="Performance, Scalability, Response Time, and Reliability"> -Performance, Scalability, Response Time, and Reliability</a></h3> - -<p> -Expanding on the -<a href="#Performance and Scalability">earlier discussion</a>, -RCU is used heavily by hot code paths in performance-critical -portions of the Linux kernel's networking, security, virtualization, -and scheduling code paths. -RCU must therefore use efficient implementations, especially in its -read-side primitives. -To that end, it would be good if preemptible RCU's implementation -of <tt>rcu_read_lock()</tt> could be inlined, however, doing -this requires resolving <tt>#include</tt> issues with the -<tt>task_struct</tt> structure. - -<p> -The Linux kernel supports hardware configurations with up to -4096 CPUs, which means that RCU must be extremely scalable. -Algorithms that involve frequent acquisitions of global locks or -frequent atomic operations on global variables simply cannot be -tolerated within the RCU implementation. -RCU therefore makes heavy use of a combining tree based on the -<tt>rcu_node</tt> structure. -RCU is required to tolerate all CPUs continuously invoking any -combination of RCU's runtime primitives with minimal per-operation -overhead. 
-In fact, in many cases, increasing load must <i>decrease</i> the -per-operation overhead, witness the batching optimizations for -<tt>synchronize_rcu()</tt>, <tt>call_rcu()</tt>, -<tt>synchronize_rcu_expedited()</tt>, and <tt>rcu_barrier()</tt>. -As a general rule, RCU must cheerfully accept whatever the -rest of the Linux kernel decides to throw at it. - -<p> -The Linux kernel is used for real-time workloads, especially -in conjunction with the -<a href="https://rt.wiki.kernel.org/index.php/Main_Page">-rt patchset</a>. -The real-time-latency response requirements are such that the -traditional approach of disabling preemption across RCU -read-side critical sections is inappropriate. -Kernels built with <tt>CONFIG_PREEMPT=y</tt> therefore -use an RCU implementation that allows RCU read-side critical -sections to be preempted. -This requirement made its presence known after users made it -clear that an earlier -<a href="https://lwn.net/Articles/107930/">real-time patch</a> -did not meet their needs, in conjunction with some -<a href="https://lkml.kernel.org/g/20050318002026.GA2693@us.ibm.com">RCU issues</a> -encountered by a very early version of the -rt patchset. - -<p> -In addition, RCU must make do with a sub-100-microsecond real-time latency -budget. -In fact, on smaller systems with the -rt patchset, the Linux kernel -provides sub-20-microsecond real-time latencies for the whole kernel, -including RCU. -RCU's scalability and latency must therefore be sufficient for -these sorts of configurations. -To my surprise, the sub-100-microsecond real-time latency budget -<a href="http://www.rdrop.com/users/paulmck/realtime/paper/bigrt.2013.01.31a.LCA.pdf"> -applies to even the largest systems [PDF]</a>, -up to and including systems with 4096 CPUs. -This real-time requirement motivated the grace-period kthread, which -also simplified handling of a number of race conditions. 
- -<p> -Finally, RCU's status as a synchronization primitive means that -any RCU failure can result in arbitrary memory corruption that can be -extremely difficult to debug. -This means that RCU must be extremely reliable, which in -practice also means that RCU must have an aggressive stress-test -suite. -This stress-test suite is called <tt>rcutorture</tt>. - -<p> -Although the need for <tt>rcutorture</tt> was no surprise, -the current immense popularity of the Linux kernel is posing -interesting—and perhaps unprecedented—validation -challenges. -To see this, keep in mind that there are well over one billion -instances of the Linux kernel running today, given Android -smartphones, Linux-powered televisions, and servers. -This number can be expected to increase sharply with the advent of -the celebrated Internet of Things. - -<p> -Suppose that RCU contains a race condition that manifests on average -once per million years of runtime. -This bug will be occurring about three times per <i>day</i> across -the installed base. -RCU could simply hide behind hardware error rates, given that no one -should really expect their smartphone to last for a million years. -However, anyone taking too much comfort from this thought should -consider the fact that in most jurisdictions, a successful multi-year -test of a given mechanism, which might include a Linux kernel, -suffices for a number of types of safety-critical certifications. -In fact, rumor has it that the Linux kernel is already being used -in production for safety-critical applications. -I don't know about you, but I would feel quite bad if a bug in RCU -killed someone. -Which might explain my recent focus on validation and verification. - -<h2><a name="Other RCU Flavors">Other RCU Flavors</a></h2> - -<p> -One of the more surprising things about RCU is that there are now -no fewer than five <i>flavors</i>, or API families. 
-In addition, the primary flavor that has been the sole focus up to -this point has two different implementations, non-preemptible and -preemptible. -The other four flavors are listed below, with requirements for each -described in a separate section. - -<ol> -<li> <a href="#Bottom-Half Flavor">Bottom-Half Flavor</a> -<li> <a href="#Sched Flavor">Sched Flavor</a> -<li> <a href="#Sleepable RCU">Sleepable RCU</a> -<li> <a href="#Tasks RCU">Tasks RCU</a> -</ol> - -<h3><a name="Bottom-Half Flavor">Bottom-Half Flavor</a></h3> - -<p> -The softirq-disable (AKA “bottom-half”, -hence the “_bh” abbreviations) -flavor of RCU, or <i>RCU-bh</i>, was developed by -Dipankar Sarma to provide a flavor of RCU that could withstand the -network-based denial-of-service attacks researched by Robert -Olsson. -These attacks placed so much networking load on the system -that some of the CPUs never exited softirq execution, -which in turn prevented those CPUs from ever executing a context switch, -which, in the RCU implementation of that time, prevented grace periods -from ever ending. -The result was an out-of-memory condition and a system hang. - -<p> -The solution was the creation of RCU-bh, which does -<tt>local_bh_disable()</tt> -across its read-side critical sections, and which uses the transition -from one type of softirq processing to another as a quiescent state -in addition to context switch, idle, user mode, and offline. -This means that RCU-bh grace periods can complete even when some of -the CPUs execute in softirq indefinitely, thus allowing algorithms -based on RCU-bh to withstand network-based denial-of-service attacks. - -<p> -Because -<tt>rcu_read_lock_bh()</tt> and <tt>rcu_read_unlock_bh()</tt> -disable and re-enable softirq handlers, any attempt to start a softirq -handlers during the -RCU-bh read-side critical section will be deferred. -In this case, <tt>rcu_read_unlock_bh()</tt> -will invoke softirq processing, which can take considerable time. 
-One can of course argue that this softirq overhead should be associated -with the code following the RCU-bh read-side critical section rather -than <tt>rcu_read_unlock_bh()</tt>, but the fact -is that most profiling tools cannot be expected to make this sort -of fine distinction. -For example, suppose that a three-millisecond-long RCU-bh read-side -critical section executes during a time of heavy networking load. -There will very likely be an attempt to invoke at least one softirq -handler during that three milliseconds, but any such invocation will -be delayed until the time of the <tt>rcu_read_unlock_bh()</tt>. -This can of course make it appear at first glance as if -<tt>rcu_read_unlock_bh()</tt> was executing very slowly. - -<p> -The -<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-bh API</a> -includes -<tt>rcu_read_lock_bh()</tt>, -<tt>rcu_read_unlock_bh()</tt>, -<tt>rcu_dereference_bh()</tt>, -<tt>rcu_dereference_bh_check()</tt>, -<tt>synchronize_rcu_bh()</tt>, -<tt>synchronize_rcu_bh_expedited()</tt>, -<tt>call_rcu_bh()</tt>, -<tt>rcu_barrier_bh()</tt>, and -<tt>rcu_read_lock_bh_held()</tt>. - -<h3><a name="Sched Flavor">Sched Flavor</a></h3> - -<p> -Before preemptible RCU, waiting for an RCU grace period had the -side effect of also waiting for all pre-existing interrupt -and NMI handlers. -However, there are legitimate preemptible-RCU implementations that -do not have this property, given that any point in the code outside -of an RCU read-side critical section can be a quiescent state. -Therefore, <i>RCU-sched</i> was created, which follows “classic” -RCU in that an RCU-sched grace period waits for for pre-existing -interrupt and NMI handlers. -In kernels built with <tt>CONFIG_PREEMPT=n</tt>, the RCU and RCU-sched -APIs have identical implementations, while kernels built with -<tt>CONFIG_PREEMPT=y</tt> provide a separate implementation for each. 
- -<p> -Note well that in <tt>CONFIG_PREEMPT=y</tt> kernels, -<tt>rcu_read_lock_sched()</tt> and <tt>rcu_read_unlock_sched()</tt> -disable and re-enable preemption, respectively. -This means that if there was a preemption attempt during the -RCU-sched read-side critical section, <tt>rcu_read_unlock_sched()</tt> -will enter the scheduler, with all the latency and overhead entailed. -Just as with <tt>rcu_read_unlock_bh()</tt>, this can make it look -as if <tt>rcu_read_unlock_sched()</tt> was executing very slowly. -However, the highest-priority task won't be preempted, so that task -will enjoy low-overhead <tt>rcu_read_unlock_sched()</tt> invocations. - -<p> -The -<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-sched API</a> -includes -<tt>rcu_read_lock_sched()</tt>, -<tt>rcu_read_unlock_sched()</tt>, -<tt>rcu_read_lock_sched_notrace()</tt>, -<tt>rcu_read_unlock_sched_notrace()</tt>, -<tt>rcu_dereference_sched()</tt>, -<tt>rcu_dereference_sched_check()</tt>, -<tt>synchronize_sched()</tt>, -<tt>synchronize_rcu_sched_expedited()</tt>, -<tt>call_rcu_sched()</tt>, -<tt>rcu_barrier_sched()</tt>, and -<tt>rcu_read_lock_sched_held()</tt>. -However, anything that disables preemption also marks an RCU-sched -read-side critical section, including -<tt>preempt_disable()</tt> and <tt>preempt_enable()</tt>, -<tt>local_irq_save()</tt> and <tt>local_irq_restore()</tt>, -and so on. - -<h3><a name="Sleepable RCU">Sleepable RCU</a></h3> - -<p> -For well over a decade, someone saying “I need to block within -an RCU read-side critical section” was a reliable indication -that this someone did not understand RCU. -After all, if you are always blocking in an RCU read-side critical -section, you can probably afford to use a higher-overhead synchronization -mechanism. -However, that changed with the advent of the Linux kernel's notifiers, -whose RCU read-side critical -sections almost never sleep, but sometimes need to. 
-This resulted in the introduction of -<a href="https://lwn.net/Articles/202847/">sleepable RCU</a>, -or <i>SRCU</i>. - -<p> -SRCU allows different domains to be defined, with each such domain -defined by an instance of an <tt>srcu_struct</tt> structure. -A pointer to this structure must be passed in to each SRCU function, -for example, <tt>synchronize_srcu(&ss)</tt>, where -<tt>ss</tt> is the <tt>srcu_struct</tt> structure. -The key benefit of these domains is that a slow SRCU reader in one -domain does not delay an SRCU grace period in some other domain. -That said, one consequence of these domains is that read-side code -must pass a “cookie” from <tt>srcu_read_lock()</tt> -to <tt>srcu_read_unlock()</tt>, for example, as follows: - -<blockquote> -<pre> - 1 int idx; - 2 - 3 idx = srcu_read_lock(&ss); - 4 do_something(); - 5 srcu_read_unlock(&ss, idx); -</pre> -</blockquote> - -<p> -As noted above, it is legal to block within SRCU read-side critical sections, -however, with great power comes great responsibility. -If you block forever in one of a given domain's SRCU read-side critical -sections, then that domain's grace periods will also be blocked forever. -Of course, one good way to block forever is to deadlock, which can -happen if any operation in a given domain's SRCU read-side critical -section can block waiting, either directly or indirectly, for that domain's -grace period to elapse. -For example, this results in a self-deadlock: - -<blockquote> -<pre> - 1 int idx; - 2 - 3 idx = srcu_read_lock(&ss); - 4 do_something(); - 5 synchronize_srcu(&ss); - 6 srcu_read_unlock(&ss, idx); -</pre> -</blockquote> - -<p> -However, if line 5 acquired a mutex that was held across -a <tt>synchronize_srcu()</tt> for domain <tt>ss</tt>, -deadlock would still be possible. 
-Furthermore, if line 5 acquired a mutex that was held across -a <tt>synchronize_srcu()</tt> for some other domain <tt>ss1</tt>, -and if an <tt>ss1</tt>-domain SRCU read-side critical section -acquired another mutex that was held across as <tt>ss</tt>-domain -<tt>synchronize_srcu()</tt>, -deadlock would again be possible. -Such a deadlock cycle could extend across an arbitrarily large number -of different SRCU domains. -Again, with great power comes great responsibility. - -<p> -Unlike the other RCU flavors, SRCU read-side critical sections can -run on idle and even offline CPUs. -This ability requires that <tt>srcu_read_lock()</tt> and -<tt>srcu_read_unlock()</tt> contain memory barriers, which means -that SRCU readers will run a bit slower than would RCU readers. -It also motivates the <tt>smp_mb__after_srcu_read_unlock()</tt> -API, which, in combination with <tt>srcu_read_unlock()</tt>, -guarantees a full memory barrier. - -<p> -The -<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">SRCU API</a> -includes -<tt>srcu_read_lock()</tt>, -<tt>srcu_read_unlock()</tt>, -<tt>srcu_dereference()</tt>, -<tt>srcu_dereference_check()</tt>, -<tt>synchronize_srcu()</tt>, -<tt>synchronize_srcu_expedited()</tt>, -<tt>call_srcu()</tt>, -<tt>srcu_barrier()</tt>, and -<tt>srcu_read_lock_held()</tt>. -It also includes -<tt>DEFINE_SRCU()</tt>, -<tt>DEFINE_STATIC_SRCU()</tt>, and -<tt>init_srcu_struct()</tt> -APIs for defining and initializing <tt>srcu_struct</tt> structures. - -<h3><a name="Tasks RCU">Tasks RCU</a></h3> - -<p> -Some forms of tracing use “tramopolines” to handle the -binary rewriting required to install different types of probes. -It would be good to be able to free old trampolines, which sounds -like a job for some form of RCU. -However, because it is necessary to be able to install a trace -anywhere in the code, it is not possible to use read-side markers -such as <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>. 
-In addition, it does not work to have these markers in the trampoline -itself, because there would need to be instructions following -<tt>rcu_read_unlock()</tt>. -Although <tt>synchronize_rcu()</tt> would guarantee that execution -reached the <tt>rcu_read_unlock()</tt>, it would not be able to -guarantee that execution had completely left the trampoline. - -<p> -The solution, in the form of -<a href="https://lwn.net/Articles/607117/"><i>Tasks RCU</i></a>, -is to have implicit -read-side critical sections that are delimited by voluntary context -switches, that is, calls to <tt>schedule()</tt>, -<tt>cond_resched_rcu_qs()</tt>, and -<tt>synchronize_rcu_tasks()</tt>. -In addition, transitions to and from userspace execution also delimit -tasks-RCU read-side critical sections. - -<p> -The tasks-RCU API is quite compact, consisting only of -<tt>call_rcu_tasks()</tt>, -<tt>synchronize_rcu_tasks()</tt>, and -<tt>rcu_barrier_tasks()</tt>. - -<h2><a name="Possible Future Changes">Possible Future Changes</a></h2> - -<p> -One of the tricks that RCU uses to attain update-side scalability is -to increase grace-period latency with increasing numbers of CPUs. -If this becomes a serious problem, it will be necessary to rework the -grace-period state machine so as to avoid the need for the additional -latency. - -<p> -Expedited grace periods scan the CPUs, so their latency and overhead -increases with increasing numbers of CPUs. -If this becomes a serious problem on large systems, it will be necessary -to do some redesign to avoid this scalability problem. - -<p> -RCU disables CPU hotplug in a few places, perhaps most notably in the -expedited grace-period and <tt>rcu_barrier()</tt> operations. -If there is a strong reason to use expedited grace periods in CPU-hotplug -notifiers, it will be necessary to avoid disabling CPU hotplug. -This would introduce some complexity, so there had better be a <i>very</i> -good reason. 
- -<p> -The tradeoff between grace-period latency on the one hand and interruptions -of other CPUs on the other hand may need to be re-examined. -The desire is of course for zero grace-period latency as well as zero -interprocessor interrupts undertaken during an expedited grace period -operation. -While this ideal is unlikely to be achievable, it is quite possible that -further improvements can be made. - -<p> -The multiprocessor implementations of RCU use a combining tree that -groups CPUs so as to reduce lock contention and increase cache locality. -However, this combining tree does not spread its memory across NUMA -nodes nor does it align the CPU groups with hardware features such -as sockets or cores. -Such spreading and alignment is currently believed to be unnecessary -because the hotpath read-side primitives do not access the combining -tree, nor does <tt>call_rcu()</tt> in the common case. -If you believe that your architecture needs such spreading and alignment, -then your architecture should also benefit from the -<tt>rcutree.rcu_fanout_leaf</tt> boot parameter, which can be set -to the number of CPUs in a socket, NUMA node, or whatever. -If the number of CPUs is too large, use a fraction of the number of -CPUs. -If the number of CPUs is a large prime number, well, that certainly -is an “interesting” architectural choice! -More flexible arrangements might be considered, but only if -<tt>rcutree.rcu_fanout_leaf</tt> has proven inadequate, and only -if the inadequacy has been demonstrated by a carefully run and -realistic system-level workload. - -<p> -Please note that arrangements that require RCU to remap CPU numbers will -require extremely good demonstration of need and full exploration of -alternatives. - -<p> -There is an embarrassingly large number of flavors of RCU, and this -number has been increasing over time. -Perhaps it will be possible to combine some at some future date. - -<p> -RCU's various kthreads are reasonably recent additions. 
-It is quite likely that adjustments will be required to more gracefully -handle extreme loads. -It might also be necessary to be able to relate CPU utilization by -RCU's kthreads and softirq handlers to the code that instigated this -CPU utilization. -For example, RCU callback overhead might be charged back to the -originating <tt>call_rcu()</tt> instance, though probably not -in production kernels. - -<h2><a name="Summary">Summary</a></h2> - -<p> -This document has presented more than two decade's worth of RCU -requirements. -Given that the requirements keep changing, this will not be the last -word on this subject, but at least it serves to get an important -subset of the requirements set forth. - -<h2><a name="Acknowledgments">Acknowledgments</a></h2> - -I am grateful to Steven Rostedt, Lai Jiangshan, Ingo Molnar, -Oleg Nesterov, Borislav Petkov, Peter Zijlstra, Boqun Feng, and -Andy Lutomirski for their help in rendering -this article human readable, and to Michelle Rankin for her support -of this effort. -Other contributions are acknowledged in the Linux kernel's git archive. -The cartoon is copyright (c) 2013 by Melissa Broussard, -and is provided -under the terms of the Creative Commons Attribution-Share Alike 3.0 -United States license. - -<p>@@QQAL@@ - -</body></html> diff --git a/Documentation/RCU/Design/htmlqqz.sh b/Documentation/RCU/Design/htmlqqz.sh deleted file mode 100755 index d354f069559b..000000000000 --- a/Documentation/RCU/Design/htmlqqz.sh +++ /dev/null @@ -1,108 +0,0 @@ -#!/bin/sh -# -# Usage: sh htmlqqz.sh file -# -# Extracts and converts quick quizzes in a proto-HTML document file.htmlx. -# Commands, all of which must be on a line by themselves: -# -# "<p>@@QQ@@": Start of a quick quiz. -# "<p>@@QQA@@": Start of a quick-quiz answer. -# "<p>@@QQE@@": End of a quick-quiz answer, and thus of the quick quiz. -# "<p>@@QQAL@@": Place to put quick-quiz answer list. -# -# Places the result in file.html. 
-# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, you can access it online at -# http://www.gnu.org/licenses/gpl-2.0.html. -# -# Copyright (c) 2013 Paul E. McKenney, IBM Corporation. - -fn=$1 -if test ! -r $fn.htmlx -then - echo "Error: $fn.htmlx unreadable." - exit 1 -fi - -echo "<!-- DO NOT HAND EDIT. -->" > $fn.html -echo "<!-- Instead, edit $fn.htmlx and run 'sh htmlqqz.sh $fn' -->" >> $fn.html -awk < $fn.htmlx >> $fn.html ' - -state == "" && $1 != "<p>@@QQ@@" && $1 != "<p>@@QQAL@@" { - print $0; - if ($0 ~ /^<p>@@QQ/) - print "Bad Quick Quiz command: " NR " (expected <p>@@QQ@@ or <p>@@QQAL@@)." > "/dev/stderr" - next; -} - -state == "" && $1 == "<p>@@QQ@@" { - qqn++; - qqlineno = NR; - haveqq = 1; - state = "qq"; - print "<p><a name=\"Quick Quiz " qqn "\"><b>Quick Quiz " qqn "</b>:</a>" - next; -} - -state == "qq" && $1 != "<p>@@QQA@@" { - qq[qqn] = qq[qqn] $0 "\n"; - print $0 - if ($0 ~ /^<p>@@QQ/) - print "Bad Quick Quiz command: " NR ". (expected <p>@@QQA@@)" > "/dev/stderr" - next; -} - -state == "qq" && $1 == "<p>@@QQA@@" { - state = "qqa"; - print "<br><a href=\"#qq" qqn "answer\">Answer</a>" - next; -} - -state == "qqa" && $1 != "<p>@@QQE@@" { - qqa[qqn] = qqa[qqn] $0 "\n"; - if ($0 ~ /^<p>@@QQ/) - print "Bad Quick Quiz command: " NR " (expected <p>@@QQE@@)." 
> "/dev/stderr" - next; -} - -state == "qqa" && $1 == "<p>@@QQE@@" { - state = ""; - next; -} - -state == "" && $1 == "<p>@@QQAL@@" { - haveqq = ""; - print "<h3><a name=\"Answers to Quick Quizzes\">" - print "Answers to Quick Quizzes</a></h3>" - print ""; - for (i = 1; i <= qqn; i++) { - print "<a name=\"qq" i "answer\"></a>" - print "<p><b>Quick Quiz " i "</b>:" - print qq[i]; - print ""; - print "</p><p><b>Answer</b>:" - print qqa[i]; - print ""; - print "</p><p><a href=\"#Quick%20Quiz%20" i "\"><b>Back to Quick Quiz " i "</b>.</a>" - print ""; - } - next; -} - -END { - if (state != "") - print "Unterminated Quick Quiz: " qqlineno "." > "/dev/stderr" - else if (haveqq) - print "Missing \"<p>@@QQAL@@\", no Quick Quiz." > "/dev/stderr" -}' diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index ec6998b1b6d0..00a3a38b375a 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -237,17 +237,17 @@ o "ktl" is the low-order 16 bits (in hexadecimal) of the count of The output of "cat rcu/rcu_preempt/rcuexp" looks as follows: -s=21872 wd0=0 wd1=0 wd2=0 wd3=5 n=0 enq=0 sc=21872 +s=21872 wd1=0 wd2=0 wd3=5 n=0 enq=0 sc=21872 These fields are as follows: o "s" is the sequence number, with an odd number indicating that an expedited grace period is in progress. -o "wd0", "wd1", "wd2", and "wd3" are the number of times that an - attempt to start an expedited grace period found that someone - else had completed an expedited grace period that satisfies the - attempted request. "Our work is done." +o "wd1", "wd2", and "wd3" are the number of times that an attempt + to start an expedited grace period found that someone else had + completed an expedited grace period that satisfies the attempted + request. "Our work is done." o "n" is number of times that a concurrent CPU-hotplug operation forced a fallback to a normal grace period. 
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index dc49c6712b17..111770ffa10e 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -681,22 +681,30 @@ Although RCU can be used in many different ways, a very common use of RCU is analogous to reader-writer locking. The following unified diff shows how closely related RCU and reader-writer locking can be. + @@ -5,5 +5,5 @@ struct el { + int data; + /* Other data fields */ + }; + -rwlock_t listmutex; + +spinlock_t listmutex; + struct el head; + @@ -13,15 +14,15 @@ struct list_head *lp; struct el *p; - - read_lock(); + - read_lock(&listmutex); - list_for_each_entry(p, head, lp) { + rcu_read_lock(); + list_for_each_entry_rcu(p, head, lp) { if (p->key == key) { *result = p->data; - - read_unlock(); + - read_unlock(&listmutex); + rcu_read_unlock(); return 1; } } - - read_unlock(); + - read_unlock(&listmutex); + rcu_read_unlock(); return 0; } @@ -732,7 +740,7 @@ Or, for those who prefer a side-by-side listing: 5 int data; 5 int data; 6 /* Other data fields */ 6 /* Other data fields */ 7 }; 7 }; - 8 spinlock_t listmutex; 8 spinlock_t listmutex; + 8 rwlock_t listmutex; 8 spinlock_t listmutex; 9 struct el head; 9 struct el head; 1 int search(long key, int *result) 1 int search(long key, int *result) @@ -740,15 +748,15 @@ Or, for those who prefer a side-by-side listing: 3 struct list_head *lp; 3 struct list_head *lp; 4 struct el *p; 4 struct el *p; 5 5 - 6 read_lock(); 6 rcu_read_lock(); + 6 read_lock(&listmutex); 6 rcu_read_lock(); 7 list_for_each_entry(p, head, lp) { 7 list_for_each_entry_rcu(p, head, lp) { 8 if (p->key == key) { 8 if (p->key == key) { 9 *result = p->data; 9 *result = p->data; -10 read_unlock(); 10 rcu_read_unlock(); +10 read_unlock(&listmutex); 10 rcu_read_unlock(); 11 return 1; 11 return 1; 12 } 12 } 13 } 13 } -14 read_unlock(); 14 rcu_read_unlock(); +14 read_unlock(&listmutex); 14 rcu_read_unlock(); 15 return 0; 15 return 0; 16 } 16 } diff 
--git a/Documentation/devicetree/bindings/regmap/regmap.txt b/Documentation/devicetree/bindings/regmap/regmap.txt index e98a9652ccc8..0127be360fe8 100644 --- a/Documentation/devicetree/bindings/regmap/regmap.txt +++ b/Documentation/devicetree/bindings/regmap/regmap.txt @@ -1,50 +1,29 @@ -Device-Tree binding for regmap - -The endianness mode of CPU & Device scenarios: -Index Device Endianness properties ---------------------------------------------------- -1 BE 'big-endian' -2 LE 'little-endian' -3 Native 'native-endian' - -For one device driver, which will run in different scenarios above -on different SoCs using the devicetree, we need one way to simplify -this. +Devicetree binding for regmap Optional properties: -- {big,little,native}-endian: these are boolean properties, if absent - then the implementation will choose a default based on the device - being controlled. These properties are for register values and all - the buffers only. Native endian means that the CPU and device have - the same endianness. -Examples: -Scenario 1 : CPU in LE mode & device in LE mode. -dev: dev@40031000 { - compatible = "name"; - reg = <0x40031000 0x1000>; - ... -}; + little-endian, + big-endian, + native-endian: See common-properties.txt for a definition -Scenario 2 : CPU in LE mode & device in BE mode. -dev: dev@40031000 { - compatible = "name"; - reg = <0x40031000 0x1000>; - ... - big-endian; -}; +Note: +Regmap defaults to little-endian register access on MMIO based +devices, this is by far the most common setting. On CPU +architectures that typically run big-endian operating systems +(e.g. PowerPC), registers can be defined as big-endian and must +be marked that way in the devicetree. -Scenario 3 : CPU in BE mode & device in BE mode. -dev: dev@40031000 { - compatible = "name"; - reg = <0x40031000 0x1000>; - ... 
-}; +On SoCs that can be operated in both big-endian and little-endian +modes, with a single hardware switch controlling both the endianness +of the CPU and a byteswap for MMIO registers (e.g. many Broadcom MIPS +chips), "native-endian" is used to allow using the same device tree +blob in both cases. -Scenario 4 : CPU in BE mode & device in LE mode. +Examples: +Scenario 1 : a register set in big-endian mode. dev: dev@40031000 { - compatible = "name"; + compatible = "syscon"; reg = <0x40031000 0x1000>; + big-endian; ... - little-endian; }; diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 0b3de80ec8f6..49673bd30b87 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3284,6 +3284,44 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Lazy RCU callbacks are those which RCU can prove do nothing more than free memory. + rcuperf.gp_exp= [KNL] + Measure performance of expedited synchronous + grace-period primitives. + + rcuperf.holdoff= [KNL] + Set test-start holdoff period. The purpose of + this parameter is to delay the start of the + test until boot completes in order to avoid + interference. + + rcuperf.nreaders= [KNL] + Set number of RCU readers. The value -1 selects + N, where N is the number of CPUs. A value + "n" less than -1 selects N-n+1, where N is again + the number of CPUs. For example, -2 selects N + (the number of CPUs), -3 selects N+1, and so on. + A value of "n" less than or equal to -N selects + a single reader. + + rcuperf.nwriters= [KNL] + Set number of RCU writers. The values operate + the same as for rcuperf.nreaders. + N, where N is the number of CPUs + + rcuperf.perf_runnable= [BOOT] + Start rcuperf running at boot time. + + rcuperf.shutdown= [KNL] + Shut the system down after performance tests + complete. This is useful for hands-off automated + testing. + + rcuperf.perf_type= [KNL] + Specify the RCU implementation to test. 
+ + rcuperf.verbose= [KNL] + Enable additional printk() statements. + rcutorture.cbflood_inter_holdoff= [KNL] Set holdoff time (jiffies) between successive callback-flood tests. diff --git a/Documentation/locking/lockdep-design.txt b/Documentation/locking/lockdep-design.txt index 5001280e9d82..9de1c158d44c 100644 --- a/Documentation/locking/lockdep-design.txt +++ b/Documentation/locking/lockdep-design.txt @@ -97,7 +97,7 @@ between any two lock-classes: <hardirq-safe> -> <hardirq-unsafe> <softirq-safe> -> <softirq-unsafe> -The first rule comes from the fact the a hardirq-safe lock could be +The first rule comes from the fact that a hardirq-safe lock could be taken by a hardirq context, interrupting a hardirq-unsafe lock - and thus could result in a lock inversion deadlock. Likewise, a softirq-safe lock could be taken by an softirq context, interrupting a softirq-unsafe @@ -220,7 +220,7 @@ calculated, which hash is unique for every lock chain. The hash value, when the chain is validated for the first time, is then put into a hash table, which hash-table can be checked in a lockfree manner. If the locking chain occurs again later on, the hash table tells us that we -dont have to validate the chain again. +don't have to validate the chain again. Troubleshooting: ---------------- diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index 3729cbe60e41..147ae8ec836f 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -4,8 +4,40 @@ By: David Howells <dhowells@redhat.com> Paul E. McKenney <paulmck@linux.vnet.ibm.com> + Will Deacon <will.deacon@arm.com> + Peter Zijlstra <peterz@infradead.org> -Contents: +========== +DISCLAIMER +========== + +This document is not a specification; it is intentionally (for the sake of +brevity) and unintentionally (due to being human) incomplete. 
This document is +meant as a guide to using the various memory barriers provided by Linux, but +in case of any doubt (and there are many) please ask. + +To repeat, this document is not a specification of what Linux expects from +hardware. + +The purpose of this document is twofold: + + (1) to specify the minimum functionality that one can rely on for any + particular barrier, and + + (2) to provide a guide as to how to use the barriers that are available. + +Note that an architecture can provide more than the minimum requirement +for any particular barrier, but if the architecture provides less than +that, that architecture is incorrect. + +Note also that it is possible that a barrier may be a no-op for an +architecture because the way that arch works renders an explicit barrier +unnecessary in that case. + + +======== +CONTENTS +======== (*) Abstract memory access model. @@ -31,15 +63,15 @@ Contents: (*) Implicit kernel memory barriers. - - Locking functions. + - Lock acquisition functions. - Interrupt disabling functions. - Sleep and wake-up functions. - Miscellaneous functions. - (*) Inter-CPU locking barrier effects. + (*) Inter-CPU acquiring barrier effects. - - Locks vs memory accesses. - - Locks vs I/O accesses. + - Acquires vs memory accesses. + - Acquires vs I/O accesses. (*) Where are memory barriers needed? @@ -61,6 +93,7 @@ Contents: (*) The things CPUs get up to. - And then there's the Alpha. + - Virtual Machine Guests. (*) Example uses. @@ -148,7 +181,7 @@ As a further example, consider this sequence of events: CPU 1 CPU 2 =============== =============== - { A == 1, B == 2, C = 3, P == &A, Q == &C } + { A == 1, B == 2, C == 3, P == &A, Q == &C } B = 4; Q = P; P = &B D = *Q; @@ -430,8 +463,9 @@ And a couple of implicit varieties: This acts as a one-way permeable barrier. It guarantees that all memory operations after the ACQUIRE operation will appear to happen after the ACQUIRE operation with respect to the other components of the system. 
- ACQUIRE operations include LOCK operations and smp_load_acquire() - operations. + ACQUIRE operations include LOCK operations and both smp_load_acquire() + and smp_cond_acquire() operations. The latter builds the necessary ACQUIRE + semantics from relying on a control dependency and smp_rmb(). Memory operations that occur before an ACQUIRE operation may appear to happen after it completes. @@ -464,6 +498,11 @@ And a couple of implicit varieties: This means that ACQUIRE acts as a minimal "acquire" operation and RELEASE acts as a minimal "release" operation. +A subset of the atomic operations described in atomic_ops.txt have ACQUIRE +and RELEASE variants in addition to fully-ordered and relaxed (no barrier +semantics) definitions. For compound atomics performing both a load and a +store, ACQUIRE semantics apply only to the load and RELEASE semantics apply +only to the store portion of the operation. Memory barriers are only required where there's a possibility of interaction between two CPUs or between a CPU and a device. If it can be guaranteed that @@ -517,7 +556,7 @@ following sequence of events: CPU 1 CPU 2 =============== =============== - { A == 1, B == 2, C = 3, P == &A, Q == &C } + { A == 1, B == 2, C == 3, P == &A, Q == &C } B = 4; <write barrier> WRITE_ONCE(P, &B) @@ -544,7 +583,7 @@ between the address load and the data load: CPU 1 CPU 2 =============== =============== - { A == 1, B == 2, C = 3, P == &A, Q == &C } + { A == 1, B == 2, C == 3, P == &A, Q == &C } B = 4; <write barrier> WRITE_ONCE(P, &B); @@ -813,9 +852,10 @@ In summary: the same variable, then those stores must be ordered, either by preceding both of them with smp_mb() or by using smp_store_release() to carry out the stores. Please note that it is -not- sufficient - to use barrier() at beginning of each leg of the "if" statement, - as optimizing compilers do not necessarily respect barrier() - in this case. 
+ to use barrier() at beginning of each leg of the "if" statement + because, as shown by the example above, optimizing compilers can + destroy the control dependency while respecting the letter of the + barrier() law. (*) Control dependencies require at least one run-time conditional between the prior load and the subsequent store, and this @@ -1731,15 +1771,15 @@ The Linux kernel has eight basic CPU memory barriers: All memory barriers except the data dependency barriers imply a compiler -barrier. Data dependencies do not impose any additional compiler ordering. +barrier. Data dependencies do not impose any additional compiler ordering. Aside: In the case of data dependencies, the compiler would be expected to issue the loads in the correct order (eg. `a[b]` would have to load the value of b before loading a[b]), however there is no guarantee in the C specification that the compiler may not speculate the value of b (eg. is equal to 1) and load a before b (eg. tmp = a[1]; if (b != 1) -tmp = a[b]; ). There is also the problem of a compiler reloading b after -having loaded a[b], thus having a newer copy of b than a[b]. A consensus +tmp = a[b]; ). There is also the problem of a compiler reloading b after +having loaded a[b], thus having a newer copy of b than a[b]. A consensus has not yet been reached about these problems, however the READ_ONCE() macro is a good place to start looking. @@ -1794,6 +1834,7 @@ There are some more advanced barrier functions: (*) lockless_dereference(); + This can be thought of as a pointer-fetch wrapper around the smp_read_barrier_depends() data-dependency barrier. @@ -1858,7 +1899,7 @@ This is a variation on the mandatory write barrier that causes writes to weakly ordered I/O regions to be partially ordered. Its effects may go beyond the CPU->Hardware interface and actually affect the hardware at some level. -See the subsection "Locks vs I/O accesses" for more information. 
+See the subsection "Acquires vs I/O accesses" for more information. =============================== @@ -1873,8 +1914,8 @@ provide more substantial guarantees, but these may not be relied upon outside of arch specific code. -ACQUIRING FUNCTIONS -------------------- +LOCK ACQUISITION FUNCTIONS +-------------------------- The Linux kernel has a number of locking constructs: @@ -1895,7 +1936,7 @@ for each construct. These operations all imply certain barriers: Memory operations issued before the ACQUIRE may be completed after the ACQUIRE operation has completed. An smp_mb__before_spinlock(), combined with a following ACQUIRE, orders prior stores against - subsequent loads and stores. Note that this is weaker than smp_mb()! + subsequent loads and stores. Note that this is weaker than smp_mb()! The smp_mb__before_spinlock() primitive is free on many architectures. (2) RELEASE operation implication: @@ -2090,9 +2131,9 @@ or: event_indicated = 1; wake_up_process(event_daemon); -A write memory barrier is implied by wake_up() and co. if and only if they wake -something up. The barrier occurs before the task state is cleared, and so sits -between the STORE to indicate the event and the STORE to set TASK_RUNNING: +A write memory barrier is implied by wake_up() and co. if and only if they +wake something up. The barrier occurs before the task state is cleared, and so +sits between the STORE to indicate the event and the STORE to set TASK_RUNNING: CPU 1 CPU 2 =============================== =============================== @@ -2206,7 +2247,7 @@ three CPUs; then should the following sequence of events occur: Then there is no guarantee as to what order CPU 3 will see the accesses to *A through *H occur in, other than the constraints imposed by the separate locks -on the separate CPUs. It might, for example, see: +on the separate CPUs. 
It might, for example, see: *E, ACQUIRE M, ACQUIRE Q, *G, *C, *F, *A, *B, RELEASE Q, *D, *H, RELEASE M @@ -2486,9 +2527,9 @@ The following operations are special locking primitives: clear_bit_unlock(); __clear_bit_unlock(); -These implement ACQUIRE-class and RELEASE-class operations. These should be used in -preference to other operations when implementing locking primitives, because -their implementations can be optimised on many architectures. +These implement ACQUIRE-class and RELEASE-class operations. These should be +used in preference to other operations when implementing locking primitives, +because their implementations can be optimised on many architectures. [!] Note that special memory barrier primitives are available for these situations because on some CPUs the atomic instructions used imply full memory @@ -2568,12 +2609,12 @@ explicit barriers are used. Normally this won't be a problem because the I/O accesses done inside such sections will include synchronous load operations on strictly ordered I/O -registers that form implicit I/O barriers. If this isn't sufficient then an +registers that form implicit I/O barriers. If this isn't sufficient then an mmiowb() may need to be used explicitly. A similar situation may occur between an interrupt routine and two routines -running on separate CPUs that communicate with each other. If such a case is +running on separate CPUs that communicate with each other. If such a case is likely, then interrupt-disabling locks should be used to guarantee ordering. @@ -2587,8 +2628,8 @@ functions: (*) inX(), outX(): These are intended to talk to I/O space rather than memory space, but - that's primarily a CPU-specific concept. The i386 and x86_64 processors do - indeed have special I/O space access cycles and instructions, but many + that's primarily a CPU-specific concept. The i386 and x86_64 processors + do indeed have special I/O space access cycles and instructions, but many CPUs don't have such a concept. 
The PCI bus, amongst others, defines an I/O space concept which - on such @@ -2610,7 +2651,7 @@ functions: Whether these are guaranteed to be fully ordered and uncombined with respect to each other on the issuing CPU depends on the characteristics - defined for the memory window through which they're accessing. On later + defined for the memory window through which they're accessing. On later i386 architecture machines, for example, this is controlled by way of the MTRR registers. @@ -2635,10 +2676,10 @@ functions: (*) readX_relaxed(), writeX_relaxed() These are similar to readX() and writeX(), but provide weaker memory - ordering guarantees. Specifically, they do not guarantee ordering with + ordering guarantees. Specifically, they do not guarantee ordering with respect to normal memory accesses (e.g. DMA buffers) nor do they guarantee - ordering with respect to LOCK or UNLOCK operations. If the latter is - required, an mmiowb() barrier can be used. Note that relaxed accesses to + ordering with respect to LOCK or UNLOCK operations. If the latter is + required, an mmiowb() barrier can be used. Note that relaxed accesses to the same peripheral are guaranteed to be ordered with respect to each other. @@ -3040,8 +3081,9 @@ The Alpha defines the Linux kernel's memory barrier model. See the subsection on "Cache Coherency" above. + VIRTUAL MACHINE GUESTS -------------------- +---------------------- Guests running within virtual machines might be affected by SMP effects even if the guest itself is compiled without SMP support. This is an artifact of @@ -3050,7 +3092,7 @@ barriers for this use-case would be possible but is often suboptimal. To handle this case optimally, low-level virt_mb() etc macros are available. These have the same effect as smp_mb() etc when SMP is enabled, but generate -identical code for SMP and non-SMP systems. For example, virtual machine guests +identical code for SMP and non-SMP systems. 
For example, virtual machine guests should use virt_mb() rather than smp_mb() when synchronizing against a (possibly SMP) host. @@ -3058,6 +3100,7 @@ These are equivalent to smp_mb() etc counterparts in all other respects, in particular, they do not control MMIO effects: to control MMIO effects, use mandatory barriers. + ============ EXAMPLE USES ============ @@ -1,7 +1,7 @@ VERSION = 4 PATCHLEVEL = 6 SUBLEVEL = 0 -EXTRAVERSION = -rc7 +EXTRAVERSION = NAME = Charred Weasel # *DOCUMENTATION* diff --git a/arch/alpha/include/asm/rwsem.h b/arch/alpha/include/asm/rwsem.h index a83bbea62c67..0131a7058778 100644 --- a/arch/alpha/include/asm/rwsem.h +++ b/arch/alpha/include/asm/rwsem.h @@ -63,7 +63,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) return res >= 0 ? 1 : 0; } -static inline void __down_write(struct rw_semaphore *sem) +static inline long ___down_write(struct rw_semaphore *sem) { long oldcount; #ifndef CONFIG_SMP @@ -83,10 +83,24 @@ static inline void __down_write(struct rw_semaphore *sem) :"=&r" (oldcount), "=m" (sem->count), "=&r" (temp) :"Ir" (RWSEM_ACTIVE_WRITE_BIAS), "m" (sem->count) : "memory"); #endif - if (unlikely(oldcount)) + return oldcount; +} + +static inline void __down_write(struct rw_semaphore *sem) +{ + if (unlikely(___down_write(sem))) rwsem_down_write_failed(sem); } +static inline int __down_write_killable(struct rw_semaphore *sem) +{ + if (unlikely(___down_write(sem))) + if (IS_ERR(rwsem_down_write_failed_killable(sem))) + return -EINTR; + + return 0; +} + /* * trylock for writing -- returns 1 if successful, 0 if contention */ diff --git a/arch/arm/boot/dts/at91sam9x5.dtsi b/arch/arm/boot/dts/at91sam9x5.dtsi index 0827d594b1f0..cd0cd5fd09a3 100644 --- a/arch/arm/boot/dts/at91sam9x5.dtsi +++ b/arch/arm/boot/dts/at91sam9x5.dtsi @@ -106,7 +106,7 @@ pmc: pmc@fffffc00 { compatible = "atmel,at91sam9x5-pmc", "syscon"; - reg = <0xfffffc00 0x100>; + reg = <0xfffffc00 0x200>; interrupts = <1 IRQ_TYPE_LEVEL_HIGH 7>; 
interrupt-controller; #address-cells = <1>; diff --git a/arch/arm/boot/dts/sama5d2.dtsi b/arch/arm/boot/dts/sama5d2.dtsi index 78996bdbd3df..9817090c1b73 100644 --- a/arch/arm/boot/dts/sama5d2.dtsi +++ b/arch/arm/boot/dts/sama5d2.dtsi @@ -280,7 +280,7 @@ status = "disabled"; nfc@c0000000 { - compatible = "atmel,sama5d4-nfc"; + compatible = "atmel,sama5d3-nfc"; #address-cells = <1>; #size-cells = <1>; reg = < /* NFC Command Registers */ diff --git a/arch/arm/include/asm/efi.h b/arch/arm/include/asm/efi.h index e0eea72deb87..a708fa1f0905 100644 --- a/arch/arm/include/asm/efi.h +++ b/arch/arm/include/asm/efi.h @@ -17,34 +17,28 @@ #include <asm/mach/map.h> #include <asm/mmu_context.h> #include <asm/pgtable.h> +#include <asm/ptrace.h> #ifdef CONFIG_EFI void efi_init(void); int efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md); +int efi_set_mapping_permissions(struct mm_struct *mm, efi_memory_desc_t *md); -#define efi_call_virt(f, ...) \ -({ \ - efi_##f##_t *__f; \ - efi_status_t __s; \ - \ - efi_virtmap_load(); \ - __f = efi.systab->runtime->f; \ - __s = __f(__VA_ARGS__); \ - efi_virtmap_unload(); \ - __s; \ -}) +#define arch_efi_call_virt_setup() efi_virtmap_load() +#define arch_efi_call_virt_teardown() efi_virtmap_unload() -#define __efi_call_virt(f, ...) \ +#define arch_efi_call_virt(f, args...) \ ({ \ efi_##f##_t *__f; \ - \ - efi_virtmap_load(); \ __f = efi.systab->runtime->f; \ - __f(__VA_ARGS__); \ - efi_virtmap_unload(); \ + __f(args); \ }) +#define ARCH_EFI_IRQ_FLAGS_MASK \ + (PSR_J_BIT | PSR_E_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT | \ + PSR_T_BIT | MODE_MASK) + static inline void efi_set_pgd(struct mm_struct *mm) { check_and_switch_context(mm, NULL); @@ -59,7 +53,16 @@ void efi_virtmap_unload(void); /* arch specific definitions used by the stub code */ -#define efi_call_early(f, ...) sys_table_arg->boottime->f(__VA_ARGS__) +#define efi_call_early(f, ...) sys_table_arg->boottime->f(__VA_ARGS__) +#define __efi_call_early(f, ...) 
f(__VA_ARGS__) +#define efi_is_64bit() (false) + +struct screen_info *alloc_screen_info(efi_system_table_t *sys_table_arg); +void free_screen_info(efi_system_table_t *sys_table, struct screen_info *si); + +static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt) +{ +} /* * A reasonable upper bound for the uncompressed kernel size is 32 MBytes, diff --git a/arch/arm/kernel/efi.c b/arch/arm/kernel/efi.c index ff8a9d8acfac..9f43ba012d10 100644 --- a/arch/arm/kernel/efi.c +++ b/arch/arm/kernel/efi.c @@ -11,6 +11,41 @@ #include <asm/mach/map.h> #include <asm/mmu_context.h> +static int __init set_permissions(pte_t *ptep, pgtable_t token, + unsigned long addr, void *data) +{ + efi_memory_desc_t *md = data; + pte_t pte = *ptep; + + if (md->attribute & EFI_MEMORY_RO) + pte = set_pte_bit(pte, __pgprot(L_PTE_RDONLY)); + if (md->attribute & EFI_MEMORY_XP) + pte = set_pte_bit(pte, __pgprot(L_PTE_XN)); + set_pte_ext(ptep, pte, PTE_EXT_NG); + return 0; +} + +int __init efi_set_mapping_permissions(struct mm_struct *mm, + efi_memory_desc_t *md) +{ + unsigned long base, size; + + base = md->virt_addr; + size = md->num_pages << EFI_PAGE_SHIFT; + + /* + * We can only use apply_to_page_range() if we can guarantee that the + * entire region was mapped using pages. This should be the case if the + * region does not cover any naturally aligned SECTION_SIZE sized + * blocks. + */ + if (round_down(base + size, SECTION_SIZE) < + round_up(base, SECTION_SIZE) + SECTION_SIZE) + return apply_to_page_range(mm, base, size, set_permissions, md); + + return 0; +} + int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) { struct map_desc desc = { @@ -34,5 +69,11 @@ int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) desc.type = MT_DEVICE; create_mapping_late(mm, &desc, true); + + /* + * If stricter permissions were specified, apply them now. 
+ */ + if (md->attribute & (EFI_MEMORY_RO | EFI_MEMORY_XP)) + return efi_set_mapping_permissions(mm, md); return 0; } diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index 2c4bea39cf22..7d4e2850910c 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -883,7 +883,8 @@ static void __init request_standard_resources(const struct machine_desc *mdesc) request_resource(&ioport_resource, &lp2); } -#if defined(CONFIG_VGA_CONSOLE) || defined(CONFIG_DUMMY_CONSOLE) +#if defined(CONFIG_VGA_CONSOLE) || defined(CONFIG_DUMMY_CONSOLE) || \ + defined(CONFIG_EFI) struct screen_info screen_info = { .orig_video_lines = 30, .orig_video_cols = 80, diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms index efa77c146415..521b1ec59157 100644 --- a/arch/arm64/Kconfig.platforms +++ b/arch/arm64/Kconfig.platforms @@ -2,6 +2,7 @@ menu "Platform selection" config ARCH_SUNXI bool "Allwinner sunxi 64-bit SoC Family" + select GENERIC_IRQ_CHIP help This enables support for Allwinner sunxi based SoCs like the A64. diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h index 8e88a696c9cb..622db3c6474e 100644 --- a/arch/arm64/include/asm/efi.h +++ b/arch/arm64/include/asm/efi.h @@ -4,6 +4,7 @@ #include <asm/io.h> #include <asm/mmu_context.h> #include <asm/neon.h> +#include <asm/ptrace.h> #include <asm/tlbflush.h> #ifdef CONFIG_EFI @@ -14,32 +15,29 @@ extern void efi_init(void); int efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md); -#define efi_call_virt(f, ...) \ +#define efi_set_mapping_permissions efi_create_mapping + +#define arch_efi_call_virt_setup() \ ({ \ - efi_##f##_t *__f; \ - efi_status_t __s; \ - \ kernel_neon_begin(); \ efi_virtmap_load(); \ - __f = efi.systab->runtime->f; \ - __s = __f(__VA_ARGS__); \ - efi_virtmap_unload(); \ - kernel_neon_end(); \ - __s; \ }) -#define __efi_call_virt(f, ...) \ +#define arch_efi_call_virt(f, args...) 
\ ({ \ efi_##f##_t *__f; \ - \ - kernel_neon_begin(); \ - efi_virtmap_load(); \ __f = efi.systab->runtime->f; \ - __f(__VA_ARGS__); \ + __f(args); \ +}) + +#define arch_efi_call_virt_teardown() \ +({ \ efi_virtmap_unload(); \ kernel_neon_end(); \ }) +#define ARCH_EFI_IRQ_FLAGS_MASK (PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT) + /* arch specific definitions used by the stub code */ /* @@ -50,7 +48,16 @@ int efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md); #define EFI_FDT_ALIGN SZ_2M /* used by allocate_new_fdt_and_exit_boot() */ #define MAX_FDT_OFFSET SZ_512M -#define efi_call_early(f, ...) sys_table_arg->boottime->f(__VA_ARGS__) +#define efi_call_early(f, ...) sys_table_arg->boottime->f(__VA_ARGS__) +#define __efi_call_early(f, ...) f(__VA_ARGS__) +#define efi_is_64bit() (true) + +#define alloc_screen_info(x...) &screen_info +#define free_screen_info(x...) + +static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt) +{ +} #define EFI_ALLOC_ALIGN SZ_64K diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index b6abc852f2a1..78f52488f9ff 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -17,22 +17,51 @@ #include <asm/efi.h> -int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) +/* + * Only regions of type EFI_RUNTIME_SERVICES_CODE need to be + * executable, everything else can be mapped with the XN bits + * set. Also take the new (optional) RO/XP bits into account. + */ +static __init pteval_t create_mapping_protection(efi_memory_desc_t *md) { - pteval_t prot_val; + u64 attr = md->attribute; + u32 type = md->type; - /* - * Only regions of type EFI_RUNTIME_SERVICES_CODE need to be - * executable, everything else can be mapped with the XN bits - * set. 
- */ - if ((md->attribute & EFI_MEMORY_WB) == 0) - prot_val = PROT_DEVICE_nGnRE; - else if (md->type == EFI_RUNTIME_SERVICES_CODE || - !PAGE_ALIGNED(md->phys_addr)) - prot_val = pgprot_val(PAGE_KERNEL_EXEC); - else - prot_val = pgprot_val(PAGE_KERNEL); + if (type == EFI_MEMORY_MAPPED_IO) + return PROT_DEVICE_nGnRE; + + if (WARN_ONCE(!PAGE_ALIGNED(md->phys_addr), + "UEFI Runtime regions are not aligned to 64 KB -- buggy firmware?")) + /* + * If the region is not aligned to the page size of the OS, we + * can not use strict permissions, since that would also affect + * the mapping attributes of the adjacent regions. + */ + return pgprot_val(PAGE_KERNEL_EXEC); + + /* R-- */ + if ((attr & (EFI_MEMORY_XP | EFI_MEMORY_RO)) == + (EFI_MEMORY_XP | EFI_MEMORY_RO)) + return pgprot_val(PAGE_KERNEL_RO); + + /* R-X */ + if (attr & EFI_MEMORY_RO) + return pgprot_val(PAGE_KERNEL_ROX); + + /* RW- */ + if (attr & EFI_MEMORY_XP || type != EFI_RUNTIME_SERVICES_CODE) + return pgprot_val(PAGE_KERNEL); + + /* RWX */ + return pgprot_val(PAGE_KERNEL_EXEC); +} + +/* we will fill this structure from the stub, so don't put it in .bss */ +struct screen_info screen_info __section(.data); + +int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) +{ + pteval_t prot_val = create_mapping_protection(md); create_pgd_mapping(mm, md->phys_addr, md->virt_addr, md->num_pages << EFI_PAGE_SHIFT, diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h index 5e360ce88f10..1428849aece8 100644 --- a/arch/arm64/kernel/image.h +++ b/arch/arm64/kernel/image.h @@ -112,6 +112,7 @@ __efistub___memset = KALLSYMS_HIDE(__pi_memset); __efistub__text = KALLSYMS_HIDE(_text); __efistub__end = KALLSYMS_HIDE(_end); __efistub__edata = KALLSYMS_HIDE(_edata); +__efistub_screen_info = KALLSYMS_HIDE(screen_info); #endif diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index a34420a5df9a..b405bbb54431 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ 
b/arch/arm64/net/bpf_jit_comp.c @@ -476,6 +476,7 @@ emit_cond_jmp: case BPF_JGE: jmp_cond = A64_COND_CS; break; + case BPF_JSET: case BPF_JNE: jmp_cond = A64_COND_NE; break; diff --git a/arch/ia64/include/asm/rwsem.h b/arch/ia64/include/asm/rwsem.h index ce112472bdd6..8b23e070b844 100644 --- a/arch/ia64/include/asm/rwsem.h +++ b/arch/ia64/include/asm/rwsem.h @@ -49,8 +49,8 @@ __down_read (struct rw_semaphore *sem) /* * lock for writing */ -static inline void -__down_write (struct rw_semaphore *sem) +static inline long +___down_write (struct rw_semaphore *sem) { long old, new; @@ -59,10 +59,26 @@ __down_write (struct rw_semaphore *sem) new = old + RWSEM_ACTIVE_WRITE_BIAS; } while (cmpxchg_acq(&sem->count, old, new) != old); - if (old != 0) + return old; +} + +static inline void +__down_write (struct rw_semaphore *sem) +{ + if (___down_write(sem)) rwsem_down_write_failed(sem); } +static inline int +__down_write_killable (struct rw_semaphore *sem) +{ + if (___down_write(sem)) + if (IS_ERR(rwsem_down_write_failed_killable(sem))) + return -EINTR; + + return 0; +} + /* * unlock after reading */ diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index 300dac3702f1..bf0865cd438a 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -531,8 +531,6 @@ efi_init (void) efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff, vendor); - set_bit(EFI_SYSTEM_TABLES, &efi.flags); - palo_phys = EFI_INVALID_TABLE_ADDR; if (efi_config_init(arch_tables) != 0) diff --git a/arch/s390/include/asm/rwsem.h b/arch/s390/include/asm/rwsem.h index fead491dfc28..c75e4471e618 100644 --- a/arch/s390/include/asm/rwsem.h +++ b/arch/s390/include/asm/rwsem.h @@ -90,7 +90,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) /* * lock for writing */ -static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) +static inline long ___down_write(struct rw_semaphore *sem) { signed long old, new, tmp; @@ -104,13 +104,23 @@ static inline void 
__down_write_nested(struct rw_semaphore *sem, int subclass) : "=&d" (old), "=&d" (new), "=Q" (sem->count) : "Q" (sem->count), "m" (tmp) : "cc", "memory"); - if (old != 0) - rwsem_down_write_failed(sem); + + return old; } static inline void __down_write(struct rw_semaphore *sem) { - __down_write_nested(sem, 0); + if (___down_write(sem)) + rwsem_down_write_failed(sem); +} + +static inline int __down_write_killable(struct rw_semaphore *sem) +{ + if (___down_write(sem)) + if (IS_ERR(rwsem_down_write_failed_killable(sem))) + return -EINTR; + + return 0; } /* diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild index a319745a7b63..751c3373a92c 100644 --- a/arch/sh/include/asm/Kbuild +++ b/arch/sh/include/asm/Kbuild @@ -26,6 +26,7 @@ generic-y += percpu.h generic-y += poll.h generic-y += preempt.h generic-y += resource.h +generic-y += rwsem.h generic-y += sembuf.h generic-y += serial.h generic-y += shmbuf.h diff --git a/arch/sh/include/asm/rwsem.h b/arch/sh/include/asm/rwsem.h deleted file mode 100644 index edab57265293..000000000000 --- a/arch/sh/include/asm/rwsem.h +++ /dev/null @@ -1,132 +0,0 @@ -/* - * include/asm-sh/rwsem.h: R/W semaphores for SH using the stuff - * in lib/rwsem.c. 
- */ - -#ifndef _ASM_SH_RWSEM_H -#define _ASM_SH_RWSEM_H - -#ifndef _LINUX_RWSEM_H -#error "please don't include asm/rwsem.h directly, use linux/rwsem.h instead" -#endif - -#ifdef __KERNEL__ - -#define RWSEM_UNLOCKED_VALUE 0x00000000 -#define RWSEM_ACTIVE_BIAS 0x00000001 -#define RWSEM_ACTIVE_MASK 0x0000ffff -#define RWSEM_WAITING_BIAS (-0x00010000) -#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS -#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - -/* - * lock for reading - */ -static inline void __down_read(struct rw_semaphore *sem) -{ - if (atomic_inc_return((atomic_t *)(&sem->count)) > 0) - smp_wmb(); - else - rwsem_down_read_failed(sem); -} - -static inline int __down_read_trylock(struct rw_semaphore *sem) -{ - int tmp; - - while ((tmp = sem->count) >= 0) { - if (tmp == cmpxchg(&sem->count, tmp, - tmp + RWSEM_ACTIVE_READ_BIAS)) { - smp_wmb(); - return 1; - } - } - return 0; -} - -/* - * lock for writing - */ -static inline void __down_write(struct rw_semaphore *sem) -{ - int tmp; - - tmp = atomic_add_return(RWSEM_ACTIVE_WRITE_BIAS, - (atomic_t *)(&sem->count)); - if (tmp == RWSEM_ACTIVE_WRITE_BIAS) - smp_wmb(); - else - rwsem_down_write_failed(sem); -} - -static inline int __down_write_trylock(struct rw_semaphore *sem) -{ - int tmp; - - tmp = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE, - RWSEM_ACTIVE_WRITE_BIAS); - smp_wmb(); - return tmp == RWSEM_UNLOCKED_VALUE; -} - -/* - * unlock after reading - */ -static inline void __up_read(struct rw_semaphore *sem) -{ - int tmp; - - smp_wmb(); - tmp = atomic_dec_return((atomic_t *)(&sem->count)); - if (tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0) - rwsem_wake(sem); -} - -/* - * unlock after writing - */ -static inline void __up_write(struct rw_semaphore *sem) -{ - smp_wmb(); - if (atomic_sub_return(RWSEM_ACTIVE_WRITE_BIAS, - (atomic_t *)(&sem->count)) < 0) - rwsem_wake(sem); -} - -/* - * implement atomic add functionality - */ -static inline void rwsem_atomic_add(int delta, struct rw_semaphore 
*sem) -{ - atomic_add(delta, (atomic_t *)(&sem->count)); -} - -/* - * downgrade write lock to read lock - */ -static inline void __downgrade_write(struct rw_semaphore *sem) -{ - int tmp; - - smp_wmb(); - tmp = atomic_add_return(-RWSEM_WAITING_BIAS, (atomic_t *)(&sem->count)); - if (tmp < 0) - rwsem_downgrade_wake(sem); -} - -static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) -{ - __down_write(sem); -} - -/* - * implement exchange and add functionality - */ -static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) -{ - smp_mb(); - return atomic_add_return(delta, (atomic_t *)(&sem->count)); -} - -#endif /* __KERNEL__ */ -#endif /* _ASM_SH_RWSEM_H */ diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild index e928618838bc..6024c26c0585 100644 --- a/arch/sparc/include/asm/Kbuild +++ b/arch/sparc/include/asm/Kbuild @@ -16,6 +16,7 @@ generic-y += mm-arch-hooks.h generic-y += module.h generic-y += mutex.h generic-y += preempt.h +generic-y += rwsem.h generic-y += serial.h generic-y += trace_clock.h generic-y += types.h diff --git a/arch/sparc/include/asm/rwsem.h b/arch/sparc/include/asm/rwsem.h deleted file mode 100644 index 069bf4d663a1..000000000000 --- a/arch/sparc/include/asm/rwsem.h +++ /dev/null @@ -1,124 +0,0 @@ -/* - * rwsem.h: R/W semaphores implemented using CAS - * - * Written by David S. Miller (davem@redhat.com), 2001. 
- * Derived from asm-i386/rwsem.h - */ -#ifndef _SPARC64_RWSEM_H -#define _SPARC64_RWSEM_H - -#ifndef _LINUX_RWSEM_H -#error "please don't include asm/rwsem.h directly, use linux/rwsem.h instead" -#endif - -#ifdef __KERNEL__ - -#define RWSEM_UNLOCKED_VALUE 0x00000000L -#define RWSEM_ACTIVE_BIAS 0x00000001L -#define RWSEM_ACTIVE_MASK 0xffffffffL -#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1) -#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS -#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - -/* - * lock for reading - */ -static inline void __down_read(struct rw_semaphore *sem) -{ - if (unlikely(atomic64_inc_return((atomic64_t *)(&sem->count)) <= 0L)) - rwsem_down_read_failed(sem); -} - -static inline int __down_read_trylock(struct rw_semaphore *sem) -{ - long tmp; - - while ((tmp = sem->count) >= 0L) { - if (tmp == cmpxchg(&sem->count, tmp, - tmp + RWSEM_ACTIVE_READ_BIAS)) { - return 1; - } - } - return 0; -} - -/* - * lock for writing - */ -static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) -{ - long tmp; - - tmp = atomic64_add_return(RWSEM_ACTIVE_WRITE_BIAS, - (atomic64_t *)(&sem->count)); - if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) - rwsem_down_write_failed(sem); -} - -static inline void __down_write(struct rw_semaphore *sem) -{ - __down_write_nested(sem, 0); -} - -static inline int __down_write_trylock(struct rw_semaphore *sem) -{ - long tmp; - - tmp = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE, - RWSEM_ACTIVE_WRITE_BIAS); - return tmp == RWSEM_UNLOCKED_VALUE; -} - -/* - * unlock after reading - */ -static inline void __up_read(struct rw_semaphore *sem) -{ - long tmp; - - tmp = atomic64_dec_return((atomic64_t *)(&sem->count)); - if (unlikely(tmp < -1L && (tmp & RWSEM_ACTIVE_MASK) == 0L)) - rwsem_wake(sem); -} - -/* - * unlock after writing - */ -static inline void __up_write(struct rw_semaphore *sem) -{ - if (unlikely(atomic64_sub_return(RWSEM_ACTIVE_WRITE_BIAS, - (atomic64_t *)(&sem->count)) < 0L)) - 
rwsem_wake(sem); -} - -/* - * implement atomic add functionality - */ -static inline void rwsem_atomic_add(long delta, struct rw_semaphore *sem) -{ - atomic64_add(delta, (atomic64_t *)(&sem->count)); -} - -/* - * downgrade write lock to read lock - */ -static inline void __downgrade_write(struct rw_semaphore *sem) -{ - long tmp; - - tmp = atomic64_add_return(-RWSEM_WAITING_BIAS, (atomic64_t *)(&sem->count)); - if (tmp < 0L) - rwsem_downgrade_wake(sem); -} - -/* - * implement exchange and add functionality - */ -static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem) -{ - return atomic64_add_return(delta, (atomic64_t *)(&sem->count)); -} - -#endif /* __KERNEL__ */ - -#endif /* _SPARC64_RWSEM_H */ diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 583d539a4197..52fef606bc54 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -571,312 +571,6 @@ free_handle: efi_call_early(free_pool, pci_handle); } -static void -setup_pixel_info(struct screen_info *si, u32 pixels_per_scan_line, - struct efi_pixel_bitmask pixel_info, int pixel_format) -{ - if (pixel_format == PIXEL_RGB_RESERVED_8BIT_PER_COLOR) { - si->lfb_depth = 32; - si->lfb_linelength = pixels_per_scan_line * 4; - si->red_size = 8; - si->red_pos = 0; - si->green_size = 8; - si->green_pos = 8; - si->blue_size = 8; - si->blue_pos = 16; - si->rsvd_size = 8; - si->rsvd_pos = 24; - } else if (pixel_format == PIXEL_BGR_RESERVED_8BIT_PER_COLOR) { - si->lfb_depth = 32; - si->lfb_linelength = pixels_per_scan_line * 4; - si->red_size = 8; - si->red_pos = 16; - si->green_size = 8; - si->green_pos = 8; - si->blue_size = 8; - si->blue_pos = 0; - si->rsvd_size = 8; - si->rsvd_pos = 24; - } else if (pixel_format == PIXEL_BIT_MASK) { - find_bits(pixel_info.red_mask, &si->red_pos, &si->red_size); - find_bits(pixel_info.green_mask, &si->green_pos, - &si->green_size); - find_bits(pixel_info.blue_mask, &si->blue_pos, &si->blue_size); - 
find_bits(pixel_info.reserved_mask, &si->rsvd_pos, - &si->rsvd_size); - si->lfb_depth = si->red_size + si->green_size + - si->blue_size + si->rsvd_size; - si->lfb_linelength = (pixels_per_scan_line * si->lfb_depth) / 8; - } else { - si->lfb_depth = 4; - si->lfb_linelength = si->lfb_width / 2; - si->red_size = 0; - si->red_pos = 0; - si->green_size = 0; - si->green_pos = 0; - si->blue_size = 0; - si->blue_pos = 0; - si->rsvd_size = 0; - si->rsvd_pos = 0; - } -} - -static efi_status_t -__gop_query32(struct efi_graphics_output_protocol_32 *gop32, - struct efi_graphics_output_mode_info **info, - unsigned long *size, u64 *fb_base) -{ - struct efi_graphics_output_protocol_mode_32 *mode; - efi_status_t status; - unsigned long m; - - m = gop32->mode; - mode = (struct efi_graphics_output_protocol_mode_32 *)m; - - status = efi_early->call(gop32->query_mode, gop32, - mode->mode, size, info); - if (status != EFI_SUCCESS) - return status; - - *fb_base = mode->frame_buffer_base; - return status; -} - -static efi_status_t -setup_gop32(struct screen_info *si, efi_guid_t *proto, - unsigned long size, void **gop_handle) -{ - struct efi_graphics_output_protocol_32 *gop32, *first_gop; - unsigned long nr_gops; - u16 width, height; - u32 pixels_per_scan_line; - u32 ext_lfb_base; - u64 fb_base; - struct efi_pixel_bitmask pixel_info; - int pixel_format; - efi_status_t status; - u32 *handles = (u32 *)(unsigned long)gop_handle; - int i; - - first_gop = NULL; - gop32 = NULL; - - nr_gops = size / sizeof(u32); - for (i = 0; i < nr_gops; i++) { - struct efi_graphics_output_mode_info *info = NULL; - efi_guid_t conout_proto = EFI_CONSOLE_OUT_DEVICE_GUID; - bool conout_found = false; - void *dummy = NULL; - u32 h = handles[i]; - u64 current_fb_base; - - status = efi_call_early(handle_protocol, h, - proto, (void **)&gop32); - if (status != EFI_SUCCESS) - continue; - - status = efi_call_early(handle_protocol, h, - &conout_proto, &dummy); - if (status == EFI_SUCCESS) - conout_found = true; - - status 
= __gop_query32(gop32, &info, &size, &current_fb_base); - if (status == EFI_SUCCESS && (!first_gop || conout_found)) { - /* - * Systems that use the UEFI Console Splitter may - * provide multiple GOP devices, not all of which are - * backed by real hardware. The workaround is to search - * for a GOP implementing the ConOut protocol, and if - * one isn't found, to just fall back to the first GOP. - */ - width = info->horizontal_resolution; - height = info->vertical_resolution; - pixel_format = info->pixel_format; - pixel_info = info->pixel_information; - pixels_per_scan_line = info->pixels_per_scan_line; - fb_base = current_fb_base; - - /* - * Once we've found a GOP supporting ConOut, - * don't bother looking any further. - */ - first_gop = gop32; - if (conout_found) - break; - } - } - - /* Did we find any GOPs? */ - if (!first_gop) - goto out; - - /* EFI framebuffer */ - si->orig_video_isVGA = VIDEO_TYPE_EFI; - - si->lfb_width = width; - si->lfb_height = height; - si->lfb_base = fb_base; - - ext_lfb_base = (u64)(unsigned long)fb_base >> 32; - if (ext_lfb_base) { - si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE; - si->ext_lfb_base = ext_lfb_base; - } - - si->pages = 1; - - setup_pixel_info(si, pixels_per_scan_line, pixel_info, pixel_format); - - si->lfb_size = si->lfb_linelength * si->lfb_height; - - si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS; -out: - return status; -} - -static efi_status_t -__gop_query64(struct efi_graphics_output_protocol_64 *gop64, - struct efi_graphics_output_mode_info **info, - unsigned long *size, u64 *fb_base) -{ - struct efi_graphics_output_protocol_mode_64 *mode; - efi_status_t status; - unsigned long m; - - m = gop64->mode; - mode = (struct efi_graphics_output_protocol_mode_64 *)m; - - status = efi_early->call(gop64->query_mode, gop64, - mode->mode, size, info); - if (status != EFI_SUCCESS) - return status; - - *fb_base = mode->frame_buffer_base; - return status; -} - -static efi_status_t -setup_gop64(struct screen_info *si, efi_guid_t 
*proto, - unsigned long size, void **gop_handle) -{ - struct efi_graphics_output_protocol_64 *gop64, *first_gop; - unsigned long nr_gops; - u16 width, height; - u32 pixels_per_scan_line; - u32 ext_lfb_base; - u64 fb_base; - struct efi_pixel_bitmask pixel_info; - int pixel_format; - efi_status_t status; - u64 *handles = (u64 *)(unsigned long)gop_handle; - int i; - - first_gop = NULL; - gop64 = NULL; - - nr_gops = size / sizeof(u64); - for (i = 0; i < nr_gops; i++) { - struct efi_graphics_output_mode_info *info = NULL; - efi_guid_t conout_proto = EFI_CONSOLE_OUT_DEVICE_GUID; - bool conout_found = false; - void *dummy = NULL; - u64 h = handles[i]; - u64 current_fb_base; - - status = efi_call_early(handle_protocol, h, - proto, (void **)&gop64); - if (status != EFI_SUCCESS) - continue; - - status = efi_call_early(handle_protocol, h, - &conout_proto, &dummy); - if (status == EFI_SUCCESS) - conout_found = true; - - status = __gop_query64(gop64, &info, &size, &current_fb_base); - if (status == EFI_SUCCESS && (!first_gop || conout_found)) { - /* - * Systems that use the UEFI Console Splitter may - * provide multiple GOP devices, not all of which are - * backed by real hardware. The workaround is to search - * for a GOP implementing the ConOut protocol, and if - * one isn't found, to just fall back to the first GOP. - */ - width = info->horizontal_resolution; - height = info->vertical_resolution; - pixel_format = info->pixel_format; - pixel_info = info->pixel_information; - pixels_per_scan_line = info->pixels_per_scan_line; - fb_base = current_fb_base; - - /* - * Once we've found a GOP supporting ConOut, - * don't bother looking any further. - */ - first_gop = gop64; - if (conout_found) - break; - } - } - - /* Did we find any GOPs? 
*/ - if (!first_gop) - goto out; - - /* EFI framebuffer */ - si->orig_video_isVGA = VIDEO_TYPE_EFI; - - si->lfb_width = width; - si->lfb_height = height; - si->lfb_base = fb_base; - - ext_lfb_base = (u64)(unsigned long)fb_base >> 32; - if (ext_lfb_base) { - si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE; - si->ext_lfb_base = ext_lfb_base; - } - - si->pages = 1; - - setup_pixel_info(si, pixels_per_scan_line, pixel_info, pixel_format); - - si->lfb_size = si->lfb_linelength * si->lfb_height; - - si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS; -out: - return status; -} - -/* - * See if we have Graphics Output Protocol - */ -static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto, - unsigned long size) -{ - efi_status_t status; - void **gop_handle = NULL; - - status = efi_call_early(allocate_pool, EFI_LOADER_DATA, - size, (void **)&gop_handle); - if (status != EFI_SUCCESS) - return status; - - status = efi_call_early(locate_handle, - EFI_LOCATE_BY_PROTOCOL, - proto, NULL, &size, gop_handle); - if (status != EFI_SUCCESS) - goto free_handle; - - if (efi_early->is64) - status = setup_gop64(si, proto, size, gop_handle); - else - status = setup_gop32(si, proto, size, gop_handle); - -free_handle: - efi_call_early(free_pool, gop_handle); - return status; -} - static efi_status_t setup_uga32(void **uga_handle, unsigned long size, u32 *width, u32 *height) { @@ -1038,7 +732,7 @@ void setup_graphics(struct boot_params *boot_params) EFI_LOCATE_BY_PROTOCOL, &graphics_proto, NULL, &size, gop_handle); if (status == EFI_BUFFER_TOO_SMALL) - status = setup_gop(si, &graphics_proto, size); + status = efi_setup_gop(NULL, si, &graphics_proto, size); if (status != EFI_SUCCESS) { size = 0; diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h index d487e727f1ec..c0223f1a89d7 100644 --- a/arch/x86/boot/compressed/eboot.h +++ b/arch/x86/boot/compressed/eboot.h @@ -11,80 +11,6 @@ #define DESC_TYPE_CODE_DATA (1 << 0) -#define 
EFI_CONSOLE_OUT_DEVICE_GUID \ - EFI_GUID(0xd3b36f2c, 0xd551, 0x11d4, 0x9a, 0x46, 0x0, 0x90, 0x27, \ - 0x3f, 0xc1, 0x4d) - -#define PIXEL_RGB_RESERVED_8BIT_PER_COLOR 0 -#define PIXEL_BGR_RESERVED_8BIT_PER_COLOR 1 -#define PIXEL_BIT_MASK 2 -#define PIXEL_BLT_ONLY 3 -#define PIXEL_FORMAT_MAX 4 - -struct efi_pixel_bitmask { - u32 red_mask; - u32 green_mask; - u32 blue_mask; - u32 reserved_mask; -}; - -struct efi_graphics_output_mode_info { - u32 version; - u32 horizontal_resolution; - u32 vertical_resolution; - int pixel_format; - struct efi_pixel_bitmask pixel_information; - u32 pixels_per_scan_line; -} __packed; - -struct efi_graphics_output_protocol_mode_32 { - u32 max_mode; - u32 mode; - u32 info; - u32 size_of_info; - u64 frame_buffer_base; - u32 frame_buffer_size; -} __packed; - -struct efi_graphics_output_protocol_mode_64 { - u32 max_mode; - u32 mode; - u64 info; - u64 size_of_info; - u64 frame_buffer_base; - u64 frame_buffer_size; -} __packed; - -struct efi_graphics_output_protocol_mode { - u32 max_mode; - u32 mode; - unsigned long info; - unsigned long size_of_info; - u64 frame_buffer_base; - unsigned long frame_buffer_size; -} __packed; - -struct efi_graphics_output_protocol_32 { - u32 query_mode; - u32 set_mode; - u32 blt; - u32 mode; -}; - -struct efi_graphics_output_protocol_64 { - u64 query_mode; - u64 set_mode; - u64 blt; - u64 mode; -}; - -struct efi_graphics_output_protocol { - void *query_mode; - unsigned long set_mode; - unsigned long blt; - struct efi_graphics_output_protocol_mode *mode; -}; - struct efi_uga_draw_protocol_32 { u32 get_mode; u32 set_mode; diff --git a/arch/x86/configs/kvm_guest.config b/arch/x86/configs/kvm_guest.config index f9affcc3b9f1..9906505c998a 100644 --- a/arch/x86/configs/kvm_guest.config +++ b/arch/x86/configs/kvm_guest.config @@ -26,3 +26,6 @@ CONFIG_VIRTIO_NET=y CONFIG_9P_FS=y CONFIG_NET_9P=y CONFIG_NET_9P_VIRTIO=y +CONFIG_SCSI_LOWLEVEL=y +CONFIG_SCSI_VIRTIO=y +CONFIG_VIRTIO_INPUT=y diff --git 
a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index b30dd8154cc2..4cddd17153fb 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -384,5 +384,5 @@ 375 i386 membarrier sys_membarrier 376 i386 mlock2 sys_mlock2 377 i386 copy_file_range sys_copy_file_range -378 i386 preadv2 sys_preadv2 -379 i386 pwritev2 sys_pwritev2 +378 i386 preadv2 sys_preadv2 compat_sys_preadv2 +379 i386 pwritev2 sys_pwritev2 compat_sys_pwritev2 diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 53748c45e488..78d1e7467eae 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -3,6 +3,7 @@ #include <asm/fpu/api.h> #include <asm/pgtable.h> +#include <asm/processor-flags.h> #include <asm/tlb.h> /* @@ -28,33 +29,22 @@ #define MAX_CMDLINE_ADDRESS UINT_MAX -#ifdef CONFIG_X86_32 +#define ARCH_EFI_IRQ_FLAGS_MASK X86_EFLAGS_IF +#ifdef CONFIG_X86_32 extern unsigned long asmlinkage efi_call_phys(void *, ...); +#define arch_efi_call_virt_setup() kernel_fpu_begin() +#define arch_efi_call_virt_teardown() kernel_fpu_end() + /* * Wrap all the virtual calls in a way that forces the parameters on the stack. */ - -/* Use this macro if your virtual returns a non-void value */ -#define efi_call_virt(f, args...) \ +#define arch_efi_call_virt(f, args...) \ ({ \ - efi_status_t __s; \ - kernel_fpu_begin(); \ - __s = ((efi_##f##_t __attribute__((regparm(0)))*) \ - efi.systab->runtime->f)(args); \ - kernel_fpu_end(); \ - __s; \ -}) - -/* Use this macro if your virtual call does not return any value */ -#define __efi_call_virt(f, args...) \ -({ \ - kernel_fpu_begin(); \ ((efi_##f##_t __attribute__((regparm(0)))*) \ efi.systab->runtime->f)(args); \ - kernel_fpu_end(); \ }) #define efi_ioremap(addr, size, type, attr) ioremap_cache(addr, size) @@ -78,10 +68,8 @@ struct efi_scratch { u64 phys_stack; } __packed; -#define efi_call_virt(f, ...) 
\ +#define arch_efi_call_virt_setup() \ ({ \ - efi_status_t __s; \ - \ efi_sync_low_kernel_mappings(); \ preempt_disable(); \ __kernel_fpu_begin(); \ @@ -91,9 +79,13 @@ struct efi_scratch { write_cr3((unsigned long)efi_scratch.efi_pgt); \ __flush_tlb_all(); \ } \ - \ - __s = efi_call((void *)efi.systab->runtime->f, __VA_ARGS__); \ - \ +}) + +#define arch_efi_call_virt(f, args...) \ + efi_call((void *)efi.systab->runtime->f, args) \ + +#define arch_efi_call_virt_teardown() \ +({ \ if (efi_scratch.use_pgd) { \ write_cr3(efi_scratch.prev_cr3); \ __flush_tlb_all(); \ @@ -101,15 +93,8 @@ struct efi_scratch { \ __kernel_fpu_end(); \ preempt_enable(); \ - __s; \ }) -/* - * All X86_64 virt calls return non-void values. Thus, use non-void call for - * virt calls that would be void on X86_32. - */ -#define __efi_call_virt(f, args...) efi_call_virt(f, args) - extern void __iomem *__init efi_ioremap(unsigned long addr, unsigned long size, u32 type, u64 attribute); @@ -180,6 +165,8 @@ static inline bool efi_runtime_supported(void) extern struct console early_efi_console; extern void parse_efi_setup(u64 phys_addr, u32 data_len); +extern void efifb_setup_from_dmi(struct screen_info *si, const char *opt); + #ifdef CONFIG_EFI_MIXED extern void efi_thunk_runtime_setup(void); extern efi_status_t efi_thunk_set_virtual_address_map( @@ -225,6 +212,11 @@ __pure const struct efi_config *__efi_early(void); #define efi_call_early(f, ...) \ __efi_early()->call(__efi_early()->f, __VA_ARGS__); +#define __efi_call_early(f, ...) 
\ + __efi_early()->call((unsigned long)f, __VA_ARGS__); + +#define efi_is_64bit() __efi_early()->is64 + extern bool efi_reboot_required(void); #else diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index ceec86eb68e9..453744c1d347 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -99,26 +99,36 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) /* * lock for writing */ -static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) +#define ____down_write(sem, slow_path) \ +({ \ + long tmp; \ + struct rw_semaphore* ret; \ + asm volatile("# beginning down_write\n\t" \ + LOCK_PREFIX " xadd %1,(%3)\n\t" \ + /* adds 0xffff0001, returns the old value */ \ + " test " __ASM_SEL(%w1,%k1) "," __ASM_SEL(%w1,%k1) "\n\t" \ + /* was the active mask 0 before? */\ + " jz 1f\n" \ + " call " slow_path "\n" \ + "1:\n" \ + "# ending down_write" \ + : "+m" (sem->count), "=d" (tmp), "=a" (ret) \ + : "a" (sem), "1" (RWSEM_ACTIVE_WRITE_BIAS) \ + : "memory", "cc"); \ + ret; \ +}) + +static inline void __down_write(struct rw_semaphore *sem) { - long tmp; - asm volatile("# beginning down_write\n\t" - LOCK_PREFIX " xadd %1,(%2)\n\t" - /* adds 0xffff0001, returns the old value */ - " test " __ASM_SEL(%w1,%k1) "," __ASM_SEL(%w1,%k1) "\n\t" - /* was the active mask 0 before? 
*/ - " jz 1f\n" - " call call_rwsem_down_write_failed\n" - "1:\n" - "# ending down_write" - : "+m" (sem->count), "=d" (tmp) - : "a" (sem), "1" (RWSEM_ACTIVE_WRITE_BIAS) - : "memory", "cc"); + ____down_write(sem, "call_rwsem_down_write_failed"); } -static inline void __down_write(struct rw_semaphore *sem) +static inline int __down_write_killable(struct rw_semaphore *sem) { - __down_write_nested(sem, 0); + if (IS_ERR(____down_write(sem, "call_rwsem_down_write_failed_killable"))) + return -EINTR; + + return 0; } /* diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index a969ae607be8..2e7513d1f1f4 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -108,6 +108,14 @@ struct exception_table_entry { #define ARCH_HAS_RELATIVE_EXTABLE +#define swap_ex_entry_fixup(a, b, tmp, delta) \ + do { \ + (a)->fixup = (b)->fixup + (delta); \ + (b)->fixup = (tmp).fixup - (delta); \ + (a)->handler = (b)->handler + (delta); \ + (b)->handler = (tmp).handler - (delta); \ + } while (0) + extern int fixup_exception(struct pt_regs *regs, int trapnr); extern bool ex_has_fault_handler(unsigned long ip); extern int early_fixup_exception(unsigned long *ip); diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index ab0adc0fa5db..a9b31eb815f2 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -535,6 +535,15 @@ static void native_machine_emergency_restart(void) mode = reboot_mode == REBOOT_WARM ? 0x1234 : 0; *((unsigned short *)__va(0x472)) = mode; + /* + * If an EFI capsule has been registered with the firmware then + * override the reboot= parameter. 
+ */ + if (efi_capsule_pending(NULL)) { + pr_info("EFI capsule is pending, forcing EFI reboot.\n"); + reboot_type = BOOT_EFI; + } + for (;;) { /* Could also try the reset bit in the Hammer NB */ switch (reboot_type) { diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 548ddf7d6fd2..3e84ef16f657 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -248,18 +248,17 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, if (config_enabled(CONFIG_X86_64)) sp -= 128; - if (!onsigstack) { - /* This is the X/Open sanctioned signal stack switching. */ - if (ka->sa.sa_flags & SA_ONSTACK) { - if (current->sas_ss_size) - sp = current->sas_ss_sp + current->sas_ss_size; - } else if (config_enabled(CONFIG_X86_32) && - (regs->ss & 0xffff) != __USER_DS && - !(ka->sa.sa_flags & SA_RESTORER) && - ka->sa.sa_restorer) { - /* This is the legacy signal stack switching. */ - sp = (unsigned long) ka->sa.sa_restorer; - } + /* This is the X/Open sanctioned signal stack switching. */ + if (ka->sa.sa_flags & SA_ONSTACK) { + if (sas_ss_flags(sp) == 0) + sp = current->sas_ss_sp + current->sas_ss_size; + } else if (config_enabled(CONFIG_X86_32) && + !onsigstack && + (regs->ss & 0xffff) != __USER_DS && + !(ka->sa.sa_flags & SA_RESTORER) && + ka->sa.sa_restorer) { + /* This is the legacy signal stack switching. 
*/ + sp = (unsigned long) ka->sa.sa_restorer; } if (fpu->fpstate_active) { diff --git a/arch/x86/kernel/sysfb_efi.c b/arch/x86/kernel/sysfb_efi.c index 5da924bbf0a0..623965e86b65 100644 --- a/arch/x86/kernel/sysfb_efi.c +++ b/arch/x86/kernel/sysfb_efi.c @@ -68,6 +68,21 @@ struct efifb_dmi_info efifb_dmi_list[] = { [M_UNKNOWN] = { NULL, 0, 0, 0, 0, OVERRIDE_NONE } }; +void efifb_setup_from_dmi(struct screen_info *si, const char *opt) +{ + int i; + + for (i = 0; i < M_UNKNOWN; i++) { + if (efifb_dmi_list[i].base != 0 && + !strcmp(opt, efifb_dmi_list[i].optname)) { + si->lfb_base = efifb_dmi_list[i].base; + si->lfb_linelength = efifb_dmi_list[i].stride; + si->lfb_width = efifb_dmi_list[i].width; + si->lfb_height = efifb_dmi_list[i].height; + } + } +} + #define choose_value(dmivalue, fwvalue, field, flags) ({ \ typeof(fwvalue) _ret_ = fwvalue; \ if ((flags) & (field)) \ diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 0f6294376fbd..a2f24af3c999 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -5110,13 +5110,17 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt, static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) { + register void *__sp asm(_ASM_SP); ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF; + if (!(ctxt->d & ByteOp)) fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE; + asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n" : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags), - [fastop]"+S"(fop) + [fastop]"+S"(fop), "+r"(__sp) : "c"(ctxt->src2.val)); + ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK); if (!fop) /* exception is returned in fop variable */ return emulate_de(ctxt); diff --git a/arch/x86/lib/rwsem.S b/arch/x86/lib/rwsem.S index be110efa0096..bf2c6074efd2 100644 --- a/arch/x86/lib/rwsem.S +++ b/arch/x86/lib/rwsem.S @@ -29,8 +29,10 @@ * there is contention on the semaphore. * * %eax contains the semaphore pointer on entry. 
Save the C-clobbered - * registers (%eax, %edx and %ecx) except %eax whish is either a return - * value or just clobbered.. + * registers (%eax, %edx and %ecx) except %eax which is either a return + * value or just gets clobbered. Same is true for %edx so make sure GCC + * reloads it after the slow path, by making it hold a temporary, for + * example see ____down_write(). */ #define save_common_regs \ @@ -106,6 +108,16 @@ ENTRY(call_rwsem_down_write_failed) ret ENDPROC(call_rwsem_down_write_failed) +ENTRY(call_rwsem_down_write_failed_killable) + FRAME_BEGIN + save_common_regs + movq %rax,%rdi + call rwsem_down_write_failed_killable + restore_common_regs + FRAME_END + ret +ENDPROC(call_rwsem_down_write_failed_killable) + ENTRY(call_rwsem_wake) FRAME_BEGIN /* do nothing if still outstanding active readers */ diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 01be9ec3bf79..a1f0e1d0ddc2 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1125,8 +1125,14 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, int primary) { - if (cpa->pgd) + if (cpa->pgd) { + /* + * Right now, we only execute this code path when mapping + * the EFI virtual memory map regions, no other users + * provide a ->pgd value. This may change in the future. + */ return populate_pgd(cpa, vaddr); + } /* * Ignore all non primary paths. 
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 994a7df84a7b..f93545e7dc54 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -54,10 +54,6 @@ #include <asm/rtc.h> #include <asm/uv/uv.h> -#define EFI_DEBUG - -struct efi_memory_map memmap; - static struct efi efi_phys __initdata; static efi_system_table_t efi_systab __initdata; @@ -119,11 +115,10 @@ void efi_get_time(struct timespec *now) void __init efi_find_mirror(void) { - void *p; + efi_memory_desc_t *md; u64 mirror_size = 0, total_size = 0; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - efi_memory_desc_t *md = p; + for_each_efi_memory_desc(md) { unsigned long long start = md->phys_addr; unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; @@ -146,10 +141,9 @@ void __init efi_find_mirror(void) static void __init do_add_efi_memmap(void) { - void *p; + efi_memory_desc_t *md; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - efi_memory_desc_t *md = p; + for_each_efi_memory_desc(md) { unsigned long long start = md->phys_addr; unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; int e820_type; @@ -209,47 +203,47 @@ int __init efi_memblock_x86_reserve_range(void) #else pmap = (e->efi_memmap | ((__u64)e->efi_memmap_hi << 32)); #endif - memmap.phys_map = pmap; - memmap.nr_map = e->efi_memmap_size / + efi.memmap.phys_map = pmap; + efi.memmap.nr_map = e->efi_memmap_size / e->efi_memdesc_size; - memmap.desc_size = e->efi_memdesc_size; - memmap.desc_version = e->efi_memdesc_version; + efi.memmap.desc_size = e->efi_memdesc_size; + efi.memmap.desc_version = e->efi_memdesc_version; - memblock_reserve(pmap, memmap.nr_map * memmap.desc_size); + WARN(efi.memmap.desc_version != 1, + "Unexpected EFI_MEMORY_DESCRIPTOR version %ld", + efi.memmap.desc_version); - efi.memmap = &memmap; + memblock_reserve(pmap, efi.memmap.nr_map * efi.memmap.desc_size); return 0; } void __init efi_print_memmap(void) { -#ifdef EFI_DEBUG 
efi_memory_desc_t *md; - void *p; - int i; + int i = 0; - for (p = memmap.map, i = 0; - p < memmap.map_end; - p += memmap.desc_size, i++) { + for_each_efi_memory_desc(md) { char buf[64]; - md = p; pr_info("mem%02u: %s range=[0x%016llx-0x%016llx] (%lluMB)\n", - i, efi_md_typeattr_format(buf, sizeof(buf), md), + i++, efi_md_typeattr_format(buf, sizeof(buf), md), md->phys_addr, md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1, (md->num_pages >> (20 - EFI_PAGE_SHIFT))); } -#endif /* EFI_DEBUG */ } void __init efi_unmap_memmap(void) { + unsigned long size; + clear_bit(EFI_MEMMAP, &efi.flags); - if (memmap.map) { - early_memunmap(memmap.map, memmap.nr_map * memmap.desc_size); - memmap.map = NULL; + + size = efi.memmap.nr_map * efi.memmap.desc_size; + if (efi.memmap.map) { + early_memunmap(efi.memmap.map, size); + efi.memmap.map = NULL; } } @@ -352,8 +346,6 @@ static int __init efi_systab_init(void *phys) efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff); - set_bit(EFI_SYSTEM_TABLES, &efi.flags); - return 0; } @@ -440,17 +432,22 @@ static int __init efi_runtime_init(void) static int __init efi_memmap_init(void) { + unsigned long addr, size; + if (efi_enabled(EFI_PARAVIRT)) return 0; /* Map the EFI memory map */ - memmap.map = early_memremap((unsigned long)memmap.phys_map, - memmap.nr_map * memmap.desc_size); - if (memmap.map == NULL) { + size = efi.memmap.nr_map * efi.memmap.desc_size; + addr = (unsigned long)efi.memmap.phys_map; + + efi.memmap.map = early_memremap(addr, size); + if (efi.memmap.map == NULL) { pr_err("Could not map the memory map!\n"); return -ENOMEM; } - memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); + + efi.memmap.map_end = efi.memmap.map + size; if (add_efi_memmap) do_add_efi_memmap(); @@ -552,12 +549,9 @@ void __init efi_set_executable(efi_memory_desc_t *md, bool executable) void __init runtime_code_page_mkexec(void) { efi_memory_desc_t *md; - void *p; /* Make EFI runtime service code area executable */ - for (p 
= memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - + for_each_efi_memory_desc(md) { if (md->type != EFI_RUNTIME_SERVICES_CODE) continue; @@ -604,12 +598,10 @@ void __init old_map_region(efi_memory_desc_t *md) /* Merge contiguous regions of the same type and attribute */ static void __init efi_merge_regions(void) { - void *p; efi_memory_desc_t *md, *prev_md = NULL; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + for_each_efi_memory_desc(md) { u64 prev_size; - md = p; if (!prev_md) { prev_md = md; @@ -651,30 +643,31 @@ static void __init get_systab_virt_addr(efi_memory_desc_t *md) static void __init save_runtime_map(void) { #ifdef CONFIG_KEXEC_CORE + unsigned long desc_size; efi_memory_desc_t *md; - void *tmp, *p, *q = NULL; + void *tmp, *q = NULL; int count = 0; if (efi_enabled(EFI_OLD_MEMMAP)) return; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; + desc_size = efi.memmap.desc_size; + for_each_efi_memory_desc(md) { if (!(md->attribute & EFI_MEMORY_RUNTIME) || (md->type == EFI_BOOT_SERVICES_CODE) || (md->type == EFI_BOOT_SERVICES_DATA)) continue; - tmp = krealloc(q, (count + 1) * memmap.desc_size, GFP_KERNEL); + tmp = krealloc(q, (count + 1) * desc_size, GFP_KERNEL); if (!tmp) goto out; q = tmp; - memcpy(q + count * memmap.desc_size, md, memmap.desc_size); + memcpy(q + count * desc_size, md, desc_size); count++; } - efi_runtime_map_setup(q, count, memmap.desc_size); + efi_runtime_map_setup(q, count, desc_size); return; out: @@ -714,10 +707,10 @@ static inline void *efi_map_next_entry_reverse(void *entry) { /* Initial call */ if (!entry) - return memmap.map_end - memmap.desc_size; + return efi.memmap.map_end - efi.memmap.desc_size; - entry -= memmap.desc_size; - if (entry < memmap.map) + entry -= efi.memmap.desc_size; + if (entry < efi.memmap.map) return NULL; return entry; @@ -759,10 +752,10 @@ static void *efi_map_next_entry(void *entry) /* Initial call */ if (!entry) - return memmap.map; + 
return efi.memmap.map; - entry += memmap.desc_size; - if (entry >= memmap.map_end) + entry += efi.memmap.desc_size; + if (entry >= efi.memmap.map_end) return NULL; return entry; @@ -776,8 +769,11 @@ static void * __init efi_map_regions(int *count, int *pg_shift) { void *p, *new_memmap = NULL; unsigned long left = 0; + unsigned long desc_size; efi_memory_desc_t *md; + desc_size = efi.memmap.desc_size; + p = NULL; while ((p = efi_map_next_entry(p))) { md = p; @@ -792,7 +788,7 @@ static void * __init efi_map_regions(int *count, int *pg_shift) efi_map_region(md); get_systab_virt_addr(md); - if (left < memmap.desc_size) { + if (left < desc_size) { new_memmap = realloc_pages(new_memmap, *pg_shift); if (!new_memmap) return NULL; @@ -801,10 +797,9 @@ static void * __init efi_map_regions(int *count, int *pg_shift) (*pg_shift)++; } - memcpy(new_memmap + (*count * memmap.desc_size), md, - memmap.desc_size); + memcpy(new_memmap + (*count * desc_size), md, desc_size); - left -= memmap.desc_size; + left -= desc_size; (*count)++; } @@ -816,7 +811,6 @@ static void __init kexec_enter_virtual_mode(void) #ifdef CONFIG_KEXEC_CORE efi_memory_desc_t *md; unsigned int num_pages; - void *p; efi.systab = NULL; @@ -840,8 +834,7 @@ static void __init kexec_enter_virtual_mode(void) * Map efi regions which were passed via setup_data. The virt_addr is a * fixed addr which was used in first kernel of a kexec boot. 
*/ - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; + for_each_efi_memory_desc(md) { efi_map_region_fixed(md); /* FIXME: add error handling */ get_systab_virt_addr(md); } @@ -850,10 +843,10 @@ static void __init kexec_enter_virtual_mode(void) BUG_ON(!efi.systab); - num_pages = ALIGN(memmap.nr_map * memmap.desc_size, PAGE_SIZE); + num_pages = ALIGN(efi.memmap.nr_map * efi.memmap.desc_size, PAGE_SIZE); num_pages >>= PAGE_SHIFT; - if (efi_setup_page_tables(memmap.phys_map, num_pages)) { + if (efi_setup_page_tables(efi.memmap.phys_map, num_pages)) { clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); return; } @@ -937,16 +930,16 @@ static void __init __efi_enter_virtual_mode(void) if (efi_is_native()) { status = phys_efi_set_virtual_address_map( - memmap.desc_size * count, - memmap.desc_size, - memmap.desc_version, + efi.memmap.desc_size * count, + efi.memmap.desc_size, + efi.memmap.desc_version, (efi_memory_desc_t *)__pa(new_memmap)); } else { status = efi_thunk_set_virtual_address_map( efi_phys.set_virtual_address_map, - memmap.desc_size * count, - memmap.desc_size, - memmap.desc_version, + efi.memmap.desc_size * count, + efi.memmap.desc_size, + efi.memmap.desc_version, (efi_memory_desc_t *)__pa(new_memmap)); } @@ -1011,13 +1004,11 @@ void __init efi_enter_virtual_mode(void) u32 efi_mem_type(unsigned long phys_addr) { efi_memory_desc_t *md; - void *p; if (!efi_enabled(EFI_MEMMAP)) return 0; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; + for_each_efi_memory_desc(md) { if ((md->phys_addr <= phys_addr) && (phys_addr < (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)))) diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 49e4dd4a1f58..6e7242be1c87 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -55,14 +55,12 @@ struct efi_scratch efi_scratch; static void __init early_code_mapping_set_exec(int executable) { efi_memory_desc_t *md; - void *p; if 
(!(__supported_pte_mask & _PAGE_NX)) return; /* Make EFI service code area executable */ - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; + for_each_efi_memory_desc(md) { if (md->type == EFI_RUNTIME_SERVICES_CODE || md->type == EFI_BOOT_SERVICES_CODE) efi_set_executable(md, executable); @@ -253,7 +251,7 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) * Map all of RAM so that we can access arguments in the 1:1 * mapping when making EFI runtime calls. */ - for_each_efi_memory_desc(&memmap, md) { + for_each_efi_memory_desc(md) { if (md->type != EFI_CONVENTIONAL_MEMORY && md->type != EFI_LOADER_DATA && md->type != EFI_LOADER_CODE) @@ -398,7 +396,6 @@ void __init efi_runtime_update_mappings(void) unsigned long pfn; pgd_t *pgd = efi_pgd; efi_memory_desc_t *md; - void *p; if (efi_enabled(EFI_OLD_MEMMAP)) { if (__supported_pte_mask & _PAGE_NX) @@ -409,9 +406,8 @@ void __init efi_runtime_update_mappings(void) if (!efi_enabled(EFI_NX_PE_DATA)) return; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + for_each_efi_memory_desc(md) { unsigned long pf = 0; - md = p; if (!(md->attribute & EFI_MEMORY_RUNTIME)) continue; diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index ab50ada1d56e..097cb09d917b 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -195,10 +195,9 @@ static bool can_free_region(u64 start, u64 size) */ void __init efi_reserve_boot_services(void) { - void *p; + efi_memory_desc_t *md; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - efi_memory_desc_t *md = p; + for_each_efi_memory_desc(md) { u64 start = md->phys_addr; u64 size = md->num_pages << EFI_PAGE_SHIFT; bool already_reserved; @@ -250,10 +249,9 @@ void __init efi_reserve_boot_services(void) void __init efi_free_boot_services(void) { - void *p; + efi_memory_desc_t *md; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - 
efi_memory_desc_t *md = p; + for_each_efi_memory_desc(md) { unsigned long long start = md->phys_addr; unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild index b56855a1382a..28cf4c5d65ef 100644 --- a/arch/xtensa/include/asm/Kbuild +++ b/arch/xtensa/include/asm/Kbuild @@ -22,6 +22,7 @@ generic-y += mm-arch-hooks.h generic-y += percpu.h generic-y += preempt.h generic-y += resource.h +generic-y += rwsem.h generic-y += sections.h generic-y += siginfo.h generic-y += statfs.h diff --git a/arch/xtensa/include/asm/rwsem.h b/arch/xtensa/include/asm/rwsem.h deleted file mode 100644 index 249619e7e7f2..000000000000 --- a/arch/xtensa/include/asm/rwsem.h +++ /dev/null @@ -1,131 +0,0 @@ -/* - * include/asm-xtensa/rwsem.h - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Largely copied from include/asm-ppc/rwsem.h - * - * Copyright (C) 2001 - 2005 Tensilica Inc. - */ - -#ifndef _XTENSA_RWSEM_H -#define _XTENSA_RWSEM_H - -#ifndef _LINUX_RWSEM_H -#error "Please don't include <asm/rwsem.h> directly, use <linux/rwsem.h> instead." 
-#endif - -#define RWSEM_UNLOCKED_VALUE 0x00000000 -#define RWSEM_ACTIVE_BIAS 0x00000001 -#define RWSEM_ACTIVE_MASK 0x0000ffff -#define RWSEM_WAITING_BIAS (-0x00010000) -#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS -#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - -/* - * lock for reading - */ -static inline void __down_read(struct rw_semaphore *sem) -{ - if (atomic_add_return(1,(atomic_t *)(&sem->count)) > 0) - smp_wmb(); - else - rwsem_down_read_failed(sem); -} - -static inline int __down_read_trylock(struct rw_semaphore *sem) -{ - int tmp; - - while ((tmp = sem->count) >= 0) { - if (tmp == cmpxchg(&sem->count, tmp, - tmp + RWSEM_ACTIVE_READ_BIAS)) { - smp_wmb(); - return 1; - } - } - return 0; -} - -/* - * lock for writing - */ -static inline void __down_write(struct rw_semaphore *sem) -{ - int tmp; - - tmp = atomic_add_return(RWSEM_ACTIVE_WRITE_BIAS, - (atomic_t *)(&sem->count)); - if (tmp == RWSEM_ACTIVE_WRITE_BIAS) - smp_wmb(); - else - rwsem_down_write_failed(sem); -} - -static inline int __down_write_trylock(struct rw_semaphore *sem) -{ - int tmp; - - tmp = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE, - RWSEM_ACTIVE_WRITE_BIAS); - smp_wmb(); - return tmp == RWSEM_UNLOCKED_VALUE; -} - -/* - * unlock after reading - */ -static inline void __up_read(struct rw_semaphore *sem) -{ - int tmp; - - smp_wmb(); - tmp = atomic_sub_return(1,(atomic_t *)(&sem->count)); - if (tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0) - rwsem_wake(sem); -} - -/* - * unlock after writing - */ -static inline void __up_write(struct rw_semaphore *sem) -{ - smp_wmb(); - if (atomic_sub_return(RWSEM_ACTIVE_WRITE_BIAS, - (atomic_t *)(&sem->count)) < 0) - rwsem_wake(sem); -} - -/* - * implement atomic add functionality - */ -static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) -{ - atomic_add(delta, (atomic_t *)(&sem->count)); -} - -/* - * downgrade write lock to read lock - */ -static inline void __downgrade_write(struct rw_semaphore *sem) -{ - int 
tmp; - - smp_wmb(); - tmp = atomic_add_return(-RWSEM_WAITING_BIAS, (atomic_t *)(&sem->count)); - if (tmp < 0) - rwsem_downgrade_wake(sem); -} - -/* - * implement exchange and add functionality - */ -static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) -{ - smp_mb(); - return atomic_add_return(delta, (atomic_t *)(&sem->count)); -} - -#endif /* _XTENSA_RWSEM_H */ diff --git a/block/blk-map.c b/block/blk-map.c index a54f0543b956..b9f88b7751fb 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -9,24 +9,6 @@ #include "blk.h" -static bool iovec_gap_to_prv(struct request_queue *q, - struct iovec *prv, struct iovec *cur) -{ - unsigned long prev_end; - - if (!queue_virt_boundary(q)) - return false; - - if (prv->iov_base == NULL && prv->iov_len == 0) - /* prv is not set - don't check */ - return false; - - prev_end = (unsigned long)(prv->iov_base + prv->iov_len); - - return (((unsigned long)cur->iov_base & queue_virt_boundary(q)) || - prev_end & queue_virt_boundary(q)); -} - int blk_rq_append_bio(struct request_queue *q, struct request *rq, struct bio *bio) { @@ -125,31 +107,18 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, struct rq_map_data *map_data, const struct iov_iter *iter, gfp_t gfp_mask) { - struct iovec iov, prv = {.iov_base = NULL, .iov_len = 0}; - bool copy = (q->dma_pad_mask & iter->count) || map_data; + bool copy = false; + unsigned long align = q->dma_pad_mask | queue_dma_alignment(q); struct bio *bio = NULL; struct iov_iter i; int ret; - if (!iter || !iter->count) - return -EINVAL; - - iov_for_each(iov, i, *iter) { - unsigned long uaddr = (unsigned long) iov.iov_base; - - if (!iov.iov_len) - return -EINVAL; - - /* - * Keep going so we check length of all segments - */ - if ((uaddr & queue_dma_alignment(q)) || - iovec_gap_to_prv(q, &prv, &iov)) - copy = true; - - prv.iov_base = iov.iov_base; - prv.iov_len = iov.iov_len; - } + if (map_data) + copy = true; + else if (iov_iter_alignment(iter) & align) + copy = 
true; + else if (queue_virt_boundary(q)) + copy = queue_virt_boundary(q) & iov_iter_gap_alignment(iter); i = *iter; do { diff --git a/crypto/testmgr.c b/crypto/testmgr.c index b86883aedca1..7d4acc449233 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -1776,6 +1776,7 @@ static int alg_test_drbg(const struct alg_test_desc *desc, const char *driver, static int do_test_rsa(struct crypto_akcipher *tfm, struct akcipher_testvec *vecs) { + char *xbuf[XBUFSIZE]; struct akcipher_request *req; void *outbuf_enc = NULL; void *outbuf_dec = NULL; @@ -1784,9 +1785,12 @@ static int do_test_rsa(struct crypto_akcipher *tfm, int err = -ENOMEM; struct scatterlist src, dst, src_tab[2]; + if (testmgr_alloc_buf(xbuf)) + return err; + req = akcipher_request_alloc(tfm, GFP_KERNEL); if (!req) - return err; + goto free_xbuf; init_completion(&result.completion); @@ -1804,9 +1808,14 @@ static int do_test_rsa(struct crypto_akcipher *tfm, if (!outbuf_enc) goto free_req; + if (WARN_ON(vecs->m_size > PAGE_SIZE)) + goto free_all; + + memcpy(xbuf[0], vecs->m, vecs->m_size); + sg_init_table(src_tab, 2); - sg_set_buf(&src_tab[0], vecs->m, 8); - sg_set_buf(&src_tab[1], vecs->m + 8, vecs->m_size - 8); + sg_set_buf(&src_tab[0], xbuf[0], 8); + sg_set_buf(&src_tab[1], xbuf[0] + 8, vecs->m_size - 8); sg_init_one(&dst, outbuf_enc, out_len_max); akcipher_request_set_crypt(req, src_tab, &dst, vecs->m_size, out_len_max); @@ -1825,7 +1834,7 @@ static int do_test_rsa(struct crypto_akcipher *tfm, goto free_all; } /* verify that encrypted message is equal to expected */ - if (memcmp(vecs->c, sg_virt(req->dst), vecs->c_size)) { + if (memcmp(vecs->c, outbuf_enc, vecs->c_size)) { pr_err("alg: rsa: encrypt test failed. 
Invalid output\n"); err = -EINVAL; goto free_all; @@ -1840,7 +1849,13 @@ static int do_test_rsa(struct crypto_akcipher *tfm, err = -ENOMEM; goto free_all; } - sg_init_one(&src, vecs->c, vecs->c_size); + + if (WARN_ON(vecs->c_size > PAGE_SIZE)) + goto free_all; + + memcpy(xbuf[0], vecs->c, vecs->c_size); + + sg_init_one(&src, xbuf[0], vecs->c_size); sg_init_one(&dst, outbuf_dec, out_len_max); init_completion(&result.completion); akcipher_request_set_crypt(req, &src, &dst, vecs->c_size, out_len_max); @@ -1867,6 +1882,8 @@ free_all: kfree(outbuf_enc); free_req: akcipher_request_free(req); +free_xbuf: + testmgr_free_buf(xbuf); return err; } diff --git a/drivers/base/regmap/internal.h b/drivers/base/regmap/internal.h index 5c79526245c2..a0380338946a 100644 --- a/drivers/base/regmap/internal.h +++ b/drivers/base/regmap/internal.h @@ -13,6 +13,7 @@ #ifndef _REGMAP_INTERNAL_H #define _REGMAP_INTERNAL_H +#include <linux/device.h> #include <linux/regmap.h> #include <linux/fs.h> #include <linux/list.h> diff --git a/drivers/base/regmap/regmap-mmio.c b/drivers/base/regmap/regmap-mmio.c index 7526906ca080..5189fd6182f6 100644 --- a/drivers/base/regmap/regmap-mmio.c +++ b/drivers/base/regmap/regmap-mmio.c @@ -23,6 +23,8 @@ #include <linux/regmap.h> #include <linux/slab.h> +#include "internal.h" + struct regmap_mmio_context { void __iomem *regs; unsigned val_bytes; @@ -212,6 +214,7 @@ static const struct regmap_bus regmap_mmio = { .reg_write = regmap_mmio_write, .reg_read = regmap_mmio_read, .free_context = regmap_mmio_free_context, + .val_format_endian_default = REGMAP_ENDIAN_LITTLE, }; static struct regmap_mmio_context *regmap_mmio_gen_context(struct device *dev, @@ -245,7 +248,7 @@ static struct regmap_mmio_context *regmap_mmio_gen_context(struct device *dev, ctx->val_bytes = config->val_bits / 8; ctx->clk = ERR_PTR(-ENODEV); - switch (config->reg_format_endian) { + switch (regmap_get_val_endian(dev, &regmap_mmio, config)) { case REGMAP_ENDIAN_DEFAULT: case REGMAP_ENDIAN_LITTLE: 
#ifdef __LITTLE_ENDIAN diff --git a/drivers/base/regmap/regmap-spmi.c b/drivers/base/regmap/regmap-spmi.c index 7e58f6560399..4a36e415e938 100644 --- a/drivers/base/regmap/regmap-spmi.c +++ b/drivers/base/regmap/regmap-spmi.c @@ -142,7 +142,7 @@ static int regmap_spmi_ext_read(void *context, while (val_size) { len = min_t(size_t, val_size, 8); - err = spmi_ext_register_readl(context, addr, val, val_size); + err = spmi_ext_register_readl(context, addr, val, len); if (err) goto err_out; diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig index e1670d533f97..6394152f648f 100644 --- a/drivers/firmware/efi/Kconfig +++ b/drivers/firmware/efi/Kconfig @@ -87,6 +87,31 @@ config EFI_RUNTIME_WRAPPERS config EFI_ARMSTUB bool +config EFI_BOOTLOADER_CONTROL + tristate "EFI Bootloader Control" + depends on EFI_VARS + default n + ---help--- + This module installs a reboot hook, such that if reboot() is + invoked with a string argument NNN, "NNN" is copied to the + "LoaderEntryOneShot" EFI variable, to be read by the + bootloader. If the string matches one of the boot labels + defined in its configuration, the bootloader will boot once + to that label. The "LoaderEntryRebootReason" EFI variable is + set with the reboot reason: "reboot" or "shutdown". The + bootloader reads this reboot reason and takes particular + action according to its policy. + +config EFI_CAPSULE_LOADER + tristate "EFI capsule loader" + depends on EFI + help + This option exposes a loader interface "/dev/efi_capsule_loader" for + users to load EFI capsules. This driver requires working runtime + capsule support in the firmware, which many OEMs do not provide. + + Most users should say N. 
+ endmenu config UEFI_CPER diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile index 62e654f255f4..a219640f881f 100644 --- a/drivers/firmware/efi/Makefile +++ b/drivers/firmware/efi/Makefile @@ -9,7 +9,8 @@ # KASAN_SANITIZE_runtime-wrappers.o := n -obj-$(CONFIG_EFI) += efi.o vars.o reboot.o +obj-$(CONFIG_EFI) += efi.o vars.o reboot.o memattr.o +obj-$(CONFIG_EFI) += capsule.o obj-$(CONFIG_EFI_VARS) += efivars.o obj-$(CONFIG_EFI_ESRT) += esrt.o obj-$(CONFIG_EFI_VARS_PSTORE) += efi-pstore.o @@ -18,7 +19,9 @@ obj-$(CONFIG_EFI_RUNTIME_MAP) += runtime-map.o obj-$(CONFIG_EFI_RUNTIME_WRAPPERS) += runtime-wrappers.o obj-$(CONFIG_EFI_STUB) += libstub/ obj-$(CONFIG_EFI_FAKE_MEMMAP) += fake_mem.o +obj-$(CONFIG_EFI_BOOTLOADER_CONTROL) += efibc.o arm-obj-$(CONFIG_EFI) := arm-init.o arm-runtime.o obj-$(CONFIG_ARM) += $(arm-obj-y) obj-$(CONFIG_ARM64) += $(arm-obj-y) +obj-$(CONFIG_EFI_CAPSULE_LOADER) += capsule-loader.o diff --git a/drivers/firmware/efi/arm-init.c b/drivers/firmware/efi/arm-init.c index 8714f8c271ba..ef90f0c4b70a 100644 --- a/drivers/firmware/efi/arm-init.c +++ b/drivers/firmware/efi/arm-init.c @@ -11,17 +11,19 @@ * */ +#define pr_fmt(fmt) "efi: " fmt + #include <linux/efi.h> #include <linux/init.h> #include <linux/memblock.h> #include <linux/mm_types.h> #include <linux/of.h> #include <linux/of_fdt.h> +#include <linux/platform_device.h> +#include <linux/screen_info.h> #include <asm/efi.h> -struct efi_memory_map memmap; - u64 efi_system_table; static int __init is_normal_ram(efi_memory_desc_t *md) @@ -40,7 +42,7 @@ static phys_addr_t efi_to_phys(unsigned long addr) { efi_memory_desc_t *md; - for_each_efi_memory_desc(&memmap, md) { + for_each_efi_memory_desc(md) { if (!(md->attribute & EFI_MEMORY_RUNTIME)) continue; if (md->virt_addr == 0) @@ -53,6 +55,36 @@ static phys_addr_t efi_to_phys(unsigned long addr) return addr; } +static __initdata unsigned long screen_info_table = EFI_INVALID_TABLE_ADDR; + +static __initdata efi_config_table_type_t 
arch_tables[] = { + {LINUX_EFI_ARM_SCREEN_INFO_TABLE_GUID, NULL, &screen_info_table}, + {NULL_GUID, NULL, NULL} +}; + +static void __init init_screen_info(void) +{ + struct screen_info *si; + + if (screen_info_table != EFI_INVALID_TABLE_ADDR) { + si = early_memremap_ro(screen_info_table, sizeof(*si)); + if (!si) { + pr_err("Could not map screen_info config table\n"); + return; + } + screen_info = *si; + early_memunmap(si, sizeof(*si)); + + /* dummycon on ARM needs non-zero values for columns/lines */ + screen_info.orig_video_cols = 80; + screen_info.orig_video_lines = 25; + } + + if (screen_info.orig_video_isVGA == VIDEO_TYPE_EFI && + memblock_is_map_memory(screen_info.lfb_base)) + memblock_mark_nomap(screen_info.lfb_base, screen_info.lfb_size); +} + static int __init uefi_init(void) { efi_char16_t *c16; @@ -85,6 +117,8 @@ static int __init uefi_init(void) efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff); + efi.runtime_version = efi.systab->hdr.revision; + /* Show what we know for posterity */ c16 = early_memremap_ro(efi_to_phys(efi.systab->fw_vendor), sizeof(vendor) * sizeof(efi_char16_t)); @@ -108,7 +142,8 @@ static int __init uefi_init(void) goto out; } retval = efi_config_parse_tables(config_tables, efi.systab->nr_tables, - sizeof(efi_config_table_t), NULL); + sizeof(efi_config_table_t), + arch_tables); early_memunmap(config_tables, table_size); out: @@ -143,7 +178,7 @@ static __init void reserve_regions(void) if (efi_enabled(EFI_DBG)) pr_info("Processing EFI memory map:\n"); - for_each_efi_memory_desc(&memmap, md) { + for_each_efi_memory_desc(md) { paddr = md->phys_addr; npages = md->num_pages; @@ -184,9 +219,9 @@ void __init efi_init(void) efi_system_table = params.system_table; - memmap.phys_map = params.mmap; - memmap.map = early_memremap_ro(params.mmap, params.mmap_size); - if (memmap.map == NULL) { + efi.memmap.phys_map = params.mmap; + efi.memmap.map = early_memremap_ro(params.mmap, params.mmap_size); + if (efi.memmap.map == NULL) { /* 
* If we are booting via UEFI, the UEFI memory map is the only * description of memory we have, so there is little point in @@ -194,28 +229,37 @@ void __init efi_init(void) */ panic("Unable to map EFI memory map.\n"); } - memmap.map_end = memmap.map + params.mmap_size; - memmap.desc_size = params.desc_size; - memmap.desc_version = params.desc_ver; + efi.memmap.map_end = efi.memmap.map + params.mmap_size; + efi.memmap.desc_size = params.desc_size; + efi.memmap.desc_version = params.desc_ver; + + WARN(efi.memmap.desc_version != 1, + "Unexpected EFI_MEMORY_DESCRIPTOR version %ld", + efi.memmap.desc_version); if (uefi_init() < 0) return; reserve_regions(); - early_memunmap(memmap.map, params.mmap_size); + efi_memattr_init(); + early_memunmap(efi.memmap.map, params.mmap_size); - if (IS_ENABLED(CONFIG_ARM)) { - /* - * ARM currently does not allow ioremap_cache() to be called on - * memory regions that are covered by struct page. So remove the - * UEFI memory map from the linear mapping. - */ - memblock_mark_nomap(params.mmap & PAGE_MASK, - PAGE_ALIGN(params.mmap_size + - (params.mmap & ~PAGE_MASK))); - } else { - memblock_reserve(params.mmap & PAGE_MASK, - PAGE_ALIGN(params.mmap_size + - (params.mmap & ~PAGE_MASK))); - } + memblock_reserve(params.mmap & PAGE_MASK, + PAGE_ALIGN(params.mmap_size + + (params.mmap & ~PAGE_MASK))); + + init_screen_info(); +} + +static int __init register_gop_device(void) +{ + void *pd; + + if (screen_info.orig_video_isVGA != VIDEO_TYPE_EFI) + return 0; + + pd = platform_device_register_data(NULL, "efi-framebuffer", 0, + &screen_info, sizeof(screen_info)); + return PTR_ERR_OR_ZERO(pd); } +subsys_initcall(register_gop_device); diff --git a/drivers/firmware/efi/arm-runtime.c b/drivers/firmware/efi/arm-runtime.c index 6ae21e41a429..17ccf0a8787a 100644 --- a/drivers/firmware/efi/arm-runtime.c +++ b/drivers/firmware/efi/arm-runtime.c @@ -42,11 +42,13 @@ static struct mm_struct efi_mm = { static bool __init efi_virtmap_init(void) { efi_memory_desc_t 
*md; + bool systab_found; efi_mm.pgd = pgd_alloc(&efi_mm); init_new_context(NULL, &efi_mm); - for_each_efi_memory_desc(&memmap, md) { + systab_found = false; + for_each_efi_memory_desc(md) { phys_addr_t phys = md->phys_addr; int ret; @@ -64,7 +66,25 @@ static bool __init efi_virtmap_init(void) &phys, ret); return false; } + /* + * If this entry covers the address of the UEFI system table, + * calculate and record its virtual address. + */ + if (efi_system_table >= phys && + efi_system_table < phys + (md->num_pages * EFI_PAGE_SIZE)) { + efi.systab = (void *)(unsigned long)(efi_system_table - + phys + md->virt_addr); + systab_found = true; + } + } + if (!systab_found) { + pr_err("No virtual mapping found for the UEFI System Table\n"); + return false; } + + if (efi_memattr_apply_permissions(&efi_mm, efi_set_mapping_permissions)) + return false; + return true; } @@ -89,26 +109,17 @@ static int __init arm_enable_runtime_services(void) pr_info("Remapping and enabling EFI services.\n"); - mapsize = memmap.map_end - memmap.map; - memmap.map = (__force void *)ioremap_cache(memmap.phys_map, - mapsize); - if (!memmap.map) { - pr_err("Failed to remap EFI memory map\n"); - return -ENOMEM; - } - memmap.map_end = memmap.map + mapsize; - efi.memmap = &memmap; + mapsize = efi.memmap.map_end - efi.memmap.map; - efi.systab = (__force void *)ioremap_cache(efi_system_table, - sizeof(efi_system_table_t)); - if (!efi.systab) { - pr_err("Failed to remap EFI System Table\n"); + efi.memmap.map = memremap(efi.memmap.phys_map, mapsize, MEMREMAP_WB); + if (!efi.memmap.map) { + pr_err("Failed to remap EFI memory map\n"); return -ENOMEM; } - set_bit(EFI_SYSTEM_TABLES, &efi.flags); + efi.memmap.map_end = efi.memmap.map + mapsize; if (!efi_virtmap_init()) { - pr_err("No UEFI virtual mapping was installed -- runtime services will not be available\n"); + pr_err("UEFI virtual mapping missing or invalid -- runtime services will not be available\n"); return -ENOMEM; } @@ -116,8 +127,6 @@ static int 
__init arm_enable_runtime_services(void) efi_native_runtime_setup(); set_bit(EFI_RUNTIME_SERVICES, &efi.flags); - efi.runtime_version = efi.systab->hdr.revision; - return 0; } early_initcall(arm_enable_runtime_services); diff --git a/drivers/firmware/efi/capsule-loader.c b/drivers/firmware/efi/capsule-loader.c new file mode 100644 index 000000000000..c99c24bc79b0 --- /dev/null +++ b/drivers/firmware/efi/capsule-loader.c @@ -0,0 +1,343 @@ +/* + * EFI capsule loader driver. + * + * Copyright 2015 Intel Corporation + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + */ + +#define pr_fmt(fmt) "efi: " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/miscdevice.h> +#include <linux/highmem.h> +#include <linux/slab.h> +#include <linux/mutex.h> +#include <linux/efi.h> + +#define NO_FURTHER_WRITE_ACTION -1 + +struct capsule_info { + bool header_obtained; + int reset_type; + long index; + size_t count; + size_t total_size; + struct page **pages; + size_t page_bytes_remain; +}; + +/** + * efi_free_all_buff_pages - free all previous allocated buffer pages + * @cap_info: pointer to current instance of capsule_info structure + * + * In addition to freeing buffer pages, it flags NO_FURTHER_WRITE_ACTION + * to cease processing data in subsequent write(2) calls until close(2) + * is called. 
+ **/ +static void efi_free_all_buff_pages(struct capsule_info *cap_info) +{ + while (cap_info->index > 0) + __free_page(cap_info->pages[--cap_info->index]); + + cap_info->index = NO_FURTHER_WRITE_ACTION; +} + +/** + * efi_capsule_setup_info - obtain the efi capsule header in the binary and + * setup capsule_info structure + * @cap_info: pointer to current instance of capsule_info structure + * @kbuff: a mapped first page buffer pointer + * @hdr_bytes: the total received number of bytes for efi header + **/ +static ssize_t efi_capsule_setup_info(struct capsule_info *cap_info, + void *kbuff, size_t hdr_bytes) +{ + efi_capsule_header_t *cap_hdr; + size_t pages_needed; + int ret; + void *temp_page; + + /* Only process data block that is larger than efi header size */ + if (hdr_bytes < sizeof(efi_capsule_header_t)) + return 0; + + /* Reset back to the correct offset of header */ + cap_hdr = kbuff - cap_info->count; + pages_needed = ALIGN(cap_hdr->imagesize, PAGE_SIZE) >> PAGE_SHIFT; + + if (pages_needed == 0) { + pr_err("%s: pages count invalid\n", __func__); + return -EINVAL; + } + + /* Check if the capsule binary supported */ + ret = efi_capsule_supported(cap_hdr->guid, cap_hdr->flags, + cap_hdr->imagesize, + &cap_info->reset_type); + if (ret) { + pr_err("%s: efi_capsule_supported() failed\n", + __func__); + return ret; + } + + cap_info->total_size = cap_hdr->imagesize; + temp_page = krealloc(cap_info->pages, + pages_needed * sizeof(void *), + GFP_KERNEL | __GFP_ZERO); + if (!temp_page) { + pr_debug("%s: krealloc() failed\n", __func__); + return -ENOMEM; + } + + cap_info->pages = temp_page; + cap_info->header_obtained = true; + + return 0; +} + +/** + * efi_capsule_submit_update - invoke the efi_capsule_update API once binary + * upload done + * @cap_info: pointer to current instance of capsule_info structure + **/ +static ssize_t efi_capsule_submit_update(struct capsule_info *cap_info) +{ + int ret; + void *cap_hdr_temp; + + cap_hdr_temp = kmap(cap_info->pages[0]); 
+ if (!cap_hdr_temp) { + pr_debug("%s: kmap() failed\n", __func__); + return -EFAULT; + } + + ret = efi_capsule_update(cap_hdr_temp, cap_info->pages); + kunmap(cap_info->pages[0]); + if (ret) { + pr_err("%s: efi_capsule_update() failed\n", __func__); + return ret; + } + + /* Indicate capsule binary uploading is done */ + cap_info->index = NO_FURTHER_WRITE_ACTION; + pr_info("%s: Successfully upload capsule file with reboot type '%s'\n", + __func__, !cap_info->reset_type ? "RESET_COLD" : + cap_info->reset_type == 1 ? "RESET_WARM" : + "RESET_SHUTDOWN"); + return 0; +} + +/** + * efi_capsule_write - store the capsule binary and pass it to + * efi_capsule_update() API + * @file: file pointer + * @buff: buffer pointer + * @count: number of bytes in @buff + * @offp: not used + * + * Expectation: + * - A user space tool should start at the beginning of capsule binary and + * pass data in sequentially. + * - Users should close and re-open this file note in order to upload more + * capsules. + * - After an error returned, user should close the file and restart the + * operation for the next try otherwise -EIO will be returned until the + * file is closed. + * - An EFI capsule header must be located at the beginning of capsule + * binary file and passed in as first block data of write operation. 
+ **/
+static ssize_t efi_capsule_write(struct file *file, const char __user *buff,
+				 size_t count, loff_t *offp)
+{
+	int ret = 0;
+	struct capsule_info *cap_info = file->private_data;
+	struct page *page;
+	void *kbuff = NULL;
+	size_t write_byte;
+
+	if (count == 0)
+		return 0;
+
+	/* Return error while NO_FURTHER_WRITE_ACTION is flagged */
+	if (cap_info->index < 0)
+		return -EIO;
+
+	/* Only alloc a new page when previous page is full */
+	if (!cap_info->page_bytes_remain) {
+		page = alloc_page(GFP_KERNEL);
+		if (!page) {
+			pr_debug("%s: alloc_page() failed\n", __func__);
+			ret = -ENOMEM;
+			goto failed;
+		}
+
+		/* index counts the pages stored so far; the new page goes last */
+		cap_info->pages[cap_info->index++] = page;
+		cap_info->page_bytes_remain = PAGE_SIZE;
+	}
+
+	/* Always operate on the most recently stored page */
+	page = cap_info->pages[cap_info->index - 1];
+
+	kbuff = kmap(page);
+	if (!kbuff) {
+		pr_debug("%s: kmap() failed\n", __func__);
+		ret = -EFAULT;
+		goto failed;
+	}
+	/* Skip the part of the page that earlier writes already filled */
+	kbuff += PAGE_SIZE - cap_info->page_bytes_remain;
+
+	/*
+	 * Copy capsule binary data from user space to kernel space buffer.
+	 * At most the rest of the current page is consumed, so fewer than
+	 * @count bytes may be accepted by a single call.
+	 */
+	write_byte = min_t(size_t, count, cap_info->page_bytes_remain);
+	if (copy_from_user(kbuff, buff, write_byte)) {
+		pr_debug("%s: copy_from_user() failed\n", __func__);
+		ret = -EFAULT;
+		goto fail_unmap;
+	}
+	cap_info->page_bytes_remain -= write_byte;
+
+	/*
+	 * Setup capsule binary info structure. cap_info->count has not been
+	 * advanced yet, so the total number of bytes received so far is
+	 * passed explicitly.
+	 */
+	if (!cap_info->header_obtained) {
+		ret = efi_capsule_setup_info(cap_info, kbuff,
+					     cap_info->count + write_byte);
+		if (ret)
+			goto fail_unmap;
+	}
+
+	cap_info->count += write_byte;
+	kunmap(page);
+
+	/* Submit the full binary to efi_capsule_update() API */
+	if (cap_info->header_obtained &&
+	    cap_info->count >= cap_info->total_size) {
+		/* More data than the header announced is treated as an error */
+		if (cap_info->count > cap_info->total_size) {
+			pr_err("%s: upload size exceeded header defined size\n",
+			       __func__);
+			ret = -EINVAL;
+			goto failed;
+		}
+
+		ret = efi_capsule_submit_update(cap_info);
+		if (ret)
+			goto failed;
+	}
+
+	/* Number of bytes consumed by this call, possibly less than @count */
+	return write_byte;
+
+fail_unmap:
+	kunmap(page);
+failed:
+	/*
+	 * NOTE(review): efi_free_all_buff_pages() is defined outside this
+	 * view; presumably it releases the buffered pages and marks the
+	 * upload as failed so that later writes hit the -EIO check above.
+	 */
+	efi_free_all_buff_pages(cap_info);
+	return ret;
+}
+
+/**
+ * efi_capsule_flush - called by file close or file flush
+ * @file: file pointer
+ * @id: not used
+ *
+ * If a capsule is being partially uploaded then calling this function
+ * will be treated as upload termination and will free those completed
+ * buffer pages and -ECANCELED will be returned.
+ *
+ * Returns 0 when no upload is in progress.
+ **/
+static int efi_capsule_flush(struct file *file, fl_owner_t id)
+{
+	int ret = 0;
+	struct capsule_info *cap_info = file->private_data;
+
+	/* A positive index means pages were buffered but never submitted */
+	if (cap_info->index > 0) {
+		pr_err("%s: capsule upload not complete\n", __func__);
+		efi_free_all_buff_pages(cap_info);
+		ret = -ECANCELED;
+	}
+
+	return ret;
+}
+
+/**
+ * efi_capsule_release - called by file close
+ * @inode: not used
+ * @file: file pointer
+ *
+ * We will not free successfully submitted pages since efi update
+ * requires data to be maintained across system reboot.
+ *
+ * Always returns 0.
+ **/
+static int efi_capsule_release(struct inode *inode, struct file *file)
+{
+	struct capsule_info *cap_info = file->private_data;
+
+	/* Only the page pointer array and the bookkeeping struct are freed */
+	kfree(cap_info->pages);
+	kfree(file->private_data);
+	file->private_data = NULL;
+	return 0;
+}
+
+/**
+ * efi_capsule_open - called by file open
+ * @inode: not used
+ * @file: file pointer
+ *
+ * Allocates a separate capsule_info for every open() call. This makes
+ * it possible to have the file open several times at once, so that a
+ * user does not need to wait for others to finish before uploading
+ * their own capsule binary.
+ **/ +static int efi_capsule_open(struct inode *inode, struct file *file) +{ + struct capsule_info *cap_info; + + cap_info = kzalloc(sizeof(*cap_info), GFP_KERNEL); + if (!cap_info) + return -ENOMEM; + + cap_info->pages = kzalloc(sizeof(void *), GFP_KERNEL); + if (!cap_info->pages) { + kfree(cap_info); + return -ENOMEM; + } + + file->private_data = cap_info; + + return 0; +} + +static const struct file_operations efi_capsule_fops = { + .owner = THIS_MODULE, + .open = efi_capsule_open, + .write = efi_capsule_write, + .flush = efi_capsule_flush, + .release = efi_capsule_release, + .llseek = no_llseek, +}; + +static struct miscdevice efi_capsule_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = "efi_capsule_loader", + .fops = &efi_capsule_fops, +}; + +static int __init efi_capsule_loader_init(void) +{ + int ret; + + if (!efi_enabled(EFI_RUNTIME_SERVICES)) + return -ENODEV; + + ret = misc_register(&efi_capsule_misc); + if (ret) + pr_err("%s: Failed to register misc char file note\n", + __func__); + + return ret; +} +module_init(efi_capsule_loader_init); + +static void __exit efi_capsule_loader_exit(void) +{ + misc_deregister(&efi_capsule_misc); +} +module_exit(efi_capsule_loader_exit); + +MODULE_DESCRIPTION("EFI capsule firmware binary loader"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/firmware/efi/capsule.c b/drivers/firmware/efi/capsule.c new file mode 100644 index 000000000000..53b9fd2293ee --- /dev/null +++ b/drivers/firmware/efi/capsule.c @@ -0,0 +1,308 @@ +/* + * EFI capsule support. + * + * Copyright 2013 Intel Corporation; author Matt Fleming + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. 
+ */
+
+#define pr_fmt(fmt) "efi: " fmt
+
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/highmem.h>
+#include <linux/efi.h>
+#include <linux/vmalloc.h>
+#include <asm/io.h>
+
+/*
+ * One scatter gather list entry (block descriptor): @length bytes of
+ * capsule data located at physical address @data. An entry with a zero
+ * @length is a continuation pointer to the next descriptor page.
+ */
+typedef struct {
+	u64 length;
+	u64 data;
+} efi_capsule_block_desc_t;
+
+/* Set once the firmware has accepted a capsule. */
+static bool capsule_pending;
+/* Set by the reboot notifier; no capsule is submitted once this is true. */
+static bool stop_capsules;
+/* Reset type required by the accepted capsule(s); negative while unset. */
+static int efi_reset_type = -1;
+
+/*
+ * capsule_mutex serialises access to capsule_pending, efi_reset_type
+ * and stop_capsules.
+ */
+static DEFINE_MUTEX(capsule_mutex);
+
+/**
+ * efi_capsule_pending - has a capsule been passed to the firmware?
+ * @reset_type: store the type of EFI reset if capsule is pending
+ *
+ * To ensure that the registered capsule is processed correctly by the
+ * firmware we need to perform a specific type of reset. If a capsule is
+ * pending return the reset type in @reset_type. @reset_type may be NULL
+ * if the caller is not interested in the reset type.
+ *
+ * This function will race with callers of efi_capsule_update(), for
+ * example, calling this function while somebody else is in
+ * efi_capsule_update() but hasn't reached efi_capsule_update_locked()
+ * will miss the updates to capsule_pending and efi_reset_type after
+ * efi_capsule_update_locked() completes.
+ *
+ * A non-racy use is from platform reboot code because we use
+ * system_state to ensure no capsules can be sent to the firmware once
+ * we're at SYSTEM_RESTART. See efi_capsule_update_locked().
+ *
+ * Returns true if a capsule is pending, false otherwise.
+ */
+bool efi_capsule_pending(int *reset_type)
+{
+	/* Deliberately read without capsule_mutex, see the race note above */
+	if (!capsule_pending)
+		return false;
+
+	if (reset_type)
+		*reset_type = efi_reset_type;
+
+	return true;
+}
+
+/*
+ * Whitelist of EFI capsule flags that we support.
+ *
+ * We do not handle EFI_CAPSULE_INITIATE_RESET because that would
+ * require us to prepare the kernel for reboot. Refuse to load any
+ * capsules with that flag and any other flags that we do not know how
+ * to handle.
+ */ +#define EFI_CAPSULE_SUPPORTED_FLAG_MASK \ + (EFI_CAPSULE_PERSIST_ACROSS_RESET | EFI_CAPSULE_POPULATE_SYSTEM_TABLE) + +/** + * efi_capsule_supported - does the firmware support the capsule? + * @guid: vendor guid of capsule + * @flags: capsule flags + * @size: size of capsule data + * @reset: the reset type required for this capsule + * + * Check whether a capsule with @flags is supported by the firmware + * and that @size doesn't exceed the maximum size for a capsule. + * + * No attempt is made to check @reset against the reset type required + * by any pending capsules because of the races involved. + */ +int efi_capsule_supported(efi_guid_t guid, u32 flags, size_t size, int *reset) +{ + efi_capsule_header_t capsule; + efi_capsule_header_t *cap_list[] = { &capsule }; + efi_status_t status; + u64 max_size; + + if (flags & ~EFI_CAPSULE_SUPPORTED_FLAG_MASK) + return -EINVAL; + + capsule.headersize = capsule.imagesize = sizeof(capsule); + memcpy(&capsule.guid, &guid, sizeof(efi_guid_t)); + capsule.flags = flags; + + status = efi.query_capsule_caps(cap_list, 1, &max_size, reset); + if (status != EFI_SUCCESS) + return efi_status_to_err(status); + + if (size > max_size) + return -ENOSPC; + + return 0; +} +EXPORT_SYMBOL_GPL(efi_capsule_supported); + +/* + * Every scatter gather list (block descriptor) page must end with a + * continuation pointer. The last continuation pointer of the last + * page must be zero to mark the end of the chain. + */ +#define SGLIST_PER_PAGE ((PAGE_SIZE / sizeof(efi_capsule_block_desc_t)) - 1) + +/* + * How many scatter gather list (block descriptor) pages do we need + * to map @count pages? 
+ */
+static inline unsigned int sg_pages_num(unsigned int count)
+{
+	/* The last slot of every page is reserved for the continuation pointer */
+	return DIV_ROUND_UP(count, SGLIST_PER_PAGE);
+}
+
+/**
+ * efi_capsule_update_locked - pass a single capsule to the firmware
+ * @capsule: capsule to send to the firmware
+ * @sg_pages: array of scatter gather (block descriptor) pages
+ * @reset: the reset type required for @capsule
+ *
+ * Since this function must be called under capsule_mutex check
+ * whether efi_reset_type will conflict with @reset, and atomically
+ * set it and capsule_pending if a capsule was successfully sent to
+ * the firmware.
+ *
+ * We also check to see if the system is about to restart, and if so,
+ * abort. This avoids races between efi_capsule_update() and
+ * efi_capsule_pending().
+ *
+ * Return 0 on success, -EINVAL on a reset type conflict or when racing
+ * with reboot, otherwise a converted EFI status code.
+ */
+static int
+efi_capsule_update_locked(efi_capsule_header_t *capsule,
+			  struct page **sg_pages, int reset)
+{
+	efi_physical_addr_t sglist_phys;
+	efi_status_t status;
+
+	lockdep_assert_held(&capsule_mutex);
+
+	/*
+	 * If someone has already registered a capsule that requires a
+	 * different reset type, we're out of luck and must abort.
+	 */
+	if (efi_reset_type >= 0 && efi_reset_type != reset) {
+		pr_err("Conflicting capsule reset type %d (%d).\n",
+		       reset, efi_reset_type);
+		return -EINVAL;
+	}
+
+	/*
+	 * If the system is getting ready to restart it may have
+	 * called efi_capsule_pending() to make decisions (such as
+	 * whether to force an EFI reboot), and we're racing against
+	 * that call. Abort in that case.
+	 */
+	if (unlikely(stop_capsules)) {
+		pr_warn("Capsule update raced with reboot, aborting.\n");
+		return -EINVAL;
+	}
+
+	/* The firmware is handed the physical address of the first sg page */
+	sglist_phys = page_to_phys(sg_pages[0]);
+
+	/* &capsule serves as a one-element array of capsule header pointers */
+	status = efi.update_capsule(&capsule, 1, sglist_phys);
+	if (status == EFI_SUCCESS) {
+		capsule_pending = true;
+		efi_reset_type = reset;
+	}
+
+	return efi_status_to_err(status);
+}
+
+/**
+ * efi_capsule_update - send a capsule to the firmware
+ * @capsule: capsule to send to firmware
+ * @pages: an array of capsule data pages
+ *
+ * Build a scatter gather list with EFI capsule block descriptors to
+ * map the capsule described by @capsule with its data in @pages and
+ * send it to the firmware via the UpdateCapsule() runtime service.
+ *
+ * @capsule must be a virtual mapping of the first page in @pages
+ * (@pages[0]) in the kernel address space. That is, a
+ * capsule_header_t that describes the entire contents of the capsule
+ * must be at the start of the first data page.
+ *
+ * Even though this function will validate that the firmware supports
+ * the capsule guid, users will likely want to check that
+ * efi_capsule_supported() succeeds (returns 0) before calling this
+ * function because it makes it easier to print helpful error messages.
+ *
+ * If the capsule is successfully submitted to the firmware, any
+ * subsequent calls to efi_capsule_pending() will return true. @pages
+ * must not be released or modified if this function returns
+ * successfully.
+ *
+ * Callers must be prepared for this function to fail, which can
+ * happen if we raced with system reboot or if there is already a
+ * pending capsule that has a reset type that conflicts with the one
+ * required by @capsule. Do NOT use efi_capsule_pending() to detect
+ * this conflict since that would be racy. Instead, submit the capsule
+ * to efi_capsule_update() and check the return value.
+ *
+ * Return 0 on success, a converted EFI status code on failure.
+ */
+int efi_capsule_update(efi_capsule_header_t *capsule, struct page **pages)
+{
+	u32 imagesize = capsule->imagesize;
+	efi_guid_t guid = capsule->guid;
+	unsigned int count, sg_count;
+	u32 flags = capsule->flags;
+	struct page **sg_pages;
+	int rv, reset_type;
+	int i, j;
+
+	rv = efi_capsule_supported(guid, flags, imagesize, &reset_type);
+	if (rv)
+		return rv;
+
+	/* Number of data pages, and of descriptor pages needed to map them */
+	count = DIV_ROUND_UP(imagesize, PAGE_SIZE);
+	sg_count = sg_pages_num(count);
+
+	/*
+	 * An empty capsule needs no descriptor pages, but sg_pages[0] is
+	 * what gets handed to the firmware, so there is nothing valid to
+	 * submit. Reject it instead of reading a zero-length array.
+	 */
+	if (!sg_count)
+		return -EINVAL;
+
+	/* kcalloc() zeroes the array and checks the size multiplication */
+	sg_pages = kcalloc(sg_count, sizeof(*sg_pages), GFP_KERNEL);
+	if (!sg_pages)
+		return -ENOMEM;
+
+	for (i = 0; i < sg_count; i++) {
+		sg_pages[i] = alloc_page(GFP_KERNEL);
+		if (!sg_pages[i]) {
+			rv = -ENOMEM;
+			goto out;
+		}
+	}
+
+	for (i = 0; i < sg_count; i++) {
+		efi_capsule_block_desc_t *sglist;
+
+		sglist = kmap(sg_pages[i]);
+		if (!sglist) {
+			rv = -ENOMEM;
+			goto out;
+		}
+
+		/* One descriptor per data page; only the last may be partial */
+		for (j = 0; j < SGLIST_PER_PAGE && count > 0; j++) {
+			u64 sz = min_t(u64, imagesize, PAGE_SIZE);
+
+			sglist[j].length = sz;
+			sglist[j].data = page_to_phys(*pages++);
+
+			imagesize -= sz;
+			count--;
+		}
+
+		/* Continuation pointer */
+		sglist[j].length = 0;
+
+		/* Zero terminates the chain, otherwise link the next sg page */
+		if (i + 1 == sg_count)
+			sglist[j].data = 0;
+		else
+			sglist[j].data = page_to_phys(sg_pages[i + 1]);
+
+		kunmap(sg_pages[i]);
+	}
+
+	mutex_lock(&capsule_mutex);
+	rv = efi_capsule_update_locked(capsule, sg_pages, reset_type);
+	mutex_unlock(&capsule_mutex);
+
+out:
+	/*
+	 * The descriptor pages are released only on failure. On success
+	 * they are left allocated; only the pointer array is freed.
+	 */
+	for (i = 0; rv && i < sg_count; i++) {
+		if (sg_pages[i])
+			__free_page(sg_pages[i]);
+	}
+
+	kfree(sg_pages);
+	return rv;
+}
+EXPORT_SYMBOL_GPL(efi_capsule_update);
+
+/*
+ * Reboot notifier: block any further capsule submission. Taking
+ * capsule_mutex orders this against efi_capsule_update_locked().
+ */
+static int capsule_reboot_notify(struct notifier_block *nb, unsigned long event, void *cmd)
+{
+	mutex_lock(&capsule_mutex);
+	stop_capsules = true;
+	mutex_unlock(&capsule_mutex);
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block capsule_reboot_nb = {
+	.notifier_call = capsule_reboot_notify,
+};
+
+/* Register the reboot notifier; returns register_reboot_notifier()'s result */
+static int __init capsule_reboot_register(void)
+{
+	return register_reboot_notifier(&capsule_reboot_nb);
+}
+core_initcall(capsule_reboot_register); diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 3a69ed5ecfcb..05509f3aaee8 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -43,6 +43,7 @@ struct efi __read_mostly efi = { .config_table = EFI_INVALID_TABLE_ADDR, .esrt = EFI_INVALID_TABLE_ADDR, .properties_table = EFI_INVALID_TABLE_ADDR, + .mem_attr_table = EFI_INVALID_TABLE_ADDR, }; EXPORT_SYMBOL(efi); @@ -256,7 +257,7 @@ subsys_initcall(efisubsys_init); */ int __init efi_mem_desc_lookup(u64 phys_addr, efi_memory_desc_t *out_md) { - struct efi_memory_map *map = efi.memmap; + struct efi_memory_map *map = &efi.memmap; phys_addr_t p, e; if (!efi_enabled(EFI_MEMMAP)) { @@ -338,6 +339,7 @@ static __initdata efi_config_table_type_t common_tables[] = { {UGA_IO_PROTOCOL_GUID, "UGA", &efi.uga}, {EFI_SYSTEM_RESOURCE_TABLE_GUID, "ESRT", &efi.esrt}, {EFI_PROPERTIES_TABLE_GUID, "PROP", &efi.properties_table}, + {EFI_MEMORY_ATTRIBUTES_TABLE_GUID, "MEMATTR", &efi.mem_attr_table}, {NULL_GUID, NULL, NULL}, }; @@ -351,8 +353,9 @@ static __init int match_config_table(efi_guid_t *guid, for (i = 0; efi_guidcmp(table_types[i].guid, NULL_GUID); i++) { if (!efi_guidcmp(*guid, table_types[i].guid)) { *(table_types[i].ptr) = table; - pr_cont(" %s=0x%lx ", - table_types[i].name, table); + if (table_types[i].name) + pr_cont(" %s=0x%lx ", + table_types[i].name, table); return 1; } } @@ -620,16 +623,12 @@ char * __init efi_md_typeattr_format(char *buf, size_t size, */ u64 __weak efi_mem_attributes(unsigned long phys_addr) { - struct efi_memory_map *map; efi_memory_desc_t *md; - void *p; if (!efi_enabled(EFI_MEMMAP)) return 0; - map = efi.memmap; - for (p = map->map; p < map->map_end; p += map->desc_size) { - md = p; + for_each_efi_memory_desc(md) { if ((md->phys_addr <= phys_addr) && (phys_addr < (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)))) @@ -637,3 +636,36 @@ u64 __weak efi_mem_attributes(unsigned long phys_addr) } return 0; } + +int 
efi_status_to_err(efi_status_t status) +{ + int err; + + switch (status) { + case EFI_SUCCESS: + err = 0; + break; + case EFI_INVALID_PARAMETER: + err = -EINVAL; + break; + case EFI_OUT_OF_RESOURCES: + err = -ENOSPC; + break; + case EFI_DEVICE_ERROR: + err = -EIO; + break; + case EFI_WRITE_PROTECTED: + err = -EROFS; + break; + case EFI_SECURITY_VIOLATION: + err = -EACCES; + break; + case EFI_NOT_FOUND: + err = -ENOENT; + break; + default: + err = -EINVAL; + } + + return err; +} diff --git a/drivers/firmware/efi/efibc.c b/drivers/firmware/efi/efibc.c new file mode 100644 index 000000000000..8dd0c7085e59 --- /dev/null +++ b/drivers/firmware/efi/efibc.c @@ -0,0 +1,113 @@ +/* + * efibc: control EFI bootloaders which obey LoaderEntryOneShot var + * Copyright (c) 2013-2016, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */
+
+#define pr_fmt(fmt) "efibc: " fmt
+
+#include <linux/efi.h>
+#include <linux/module.h>
+#include <linux/reboot.h>
+#include <linux/slab.h>
+
+/*
+ * Widen the NUL-terminated string @str into @str16, one efi_char16_t per
+ * input byte, including the terminator. @str16 must have room for
+ * strlen(@str) + 1 elements.
+ */
+static void efibc_str_to_str16(const char *str, efi_char16_t *str16)
+{
+	size_t i;
+
+	/* Stop at the terminator instead of calling strlen() on every pass */
+	for (i = 0; str[i] != '\0'; i++)
+		str16[i] = str[i];
+
+	str16[i] = '\0';
+}
+
+/*
+ * Store @value as a 16-bit character string in the EFI variable @name
+ * under LINUX_EFI_LOADER_ENTRY_GUID.
+ *
+ * Returns 0 on success, -EINVAL if @value does not fit into the variable
+ * data buffer, -ENOMEM on allocation failure, or the error returned by
+ * efivar_entry_set().
+ */
+static int efibc_set_variable(const char *name, const char *value)
+{
+	int ret;
+	efi_guid_t guid = LINUX_EFI_LOADER_ENTRY_GUID;
+	struct efivar_entry *entry;
+	/* Size of the converted value in bytes, terminator included */
+	size_t size = (strlen(value) + 1) * sizeof(efi_char16_t);
+
+	/* sizeof does not evaluate its operand, so entry may be unset here */
+	if (size > sizeof(entry->var.Data)) {
+		pr_err("value is too large\n");
+		return -EINVAL;
+	}
+
+	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry) {
+		pr_err("failed to allocate efivar entry\n");
+		return -ENOMEM;
+	}
+
+	efibc_str_to_str16(name, entry->var.VariableName);
+	efibc_str_to_str16(value, (efi_char16_t *)entry->var.Data);
+	memcpy(&entry->var.VendorGuid, &guid, sizeof(guid));
+
+	ret = efivar_entry_set(entry,
+			       EFI_VARIABLE_NON_VOLATILE
+			       | EFI_VARIABLE_BOOTSERVICE_ACCESS
+			       | EFI_VARIABLE_RUNTIME_ACCESS,
+			       size, entry->var.Data, NULL);
+	if (ret)
+		pr_err("failed to set %s EFI variable: 0x%x\n",
+		       name, ret);
+
+	/* The entry is only a temporary carrier for the call above */
+	kfree(entry);
+	return ret;
+}
+
+/*
+ * Reboot notifier: record why the system is going down and, when a
+ * reboot argument was supplied in @data, store it as the one-shot
+ * loader entry. Failures are not propagated; NOTIFY_DONE is always
+ * returned.
+ */
+static int efibc_reboot_notifier_call(struct notifier_block *notifier,
+				      unsigned long event, void *data)
+{
+	const char *reason = "shutdown";
+	int ret;
+
+	if (event == SYS_RESTART)
+		reason = "reboot";
+
+	ret = efibc_set_variable("LoaderEntryRebootReason", reason);
+	/* Skip the one-shot entry if the reason failed or no argument given */
+	if (ret || !data)
+		return NOTIFY_DONE;
+
+	efibc_set_variable("LoaderEntryOneShot", (char *)data);
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block efibc_reboot_notifier = {
+	.notifier_call = efibc_reboot_notifier_call,
+};
+
+/*
+ * Module init: requires EFI runtime services, otherwise -ENODEV.
+ * Returns the result of register_reboot_notifier().
+ */
+static int __init efibc_init(void)
+{
+	int ret;
+
+	if (!efi_enabled(EFI_RUNTIME_SERVICES))
+		return -ENODEV;
+
+	ret = register_reboot_notifier(&efibc_reboot_notifier);
+	if (ret)
+		pr_err("unable to register reboot notifier\n");
+
+	return ret;
+}
+module_init(efibc_init); + +static void __exit efibc_exit(void) +{ + unregister_reboot_notifier(&efibc_reboot_notifier); +} +module_exit(efibc_exit); + +MODULE_AUTHOR("Jeremy Compostella <jeremy.compostella@intel.com>"); +MODULE_AUTHOR("Matt Gumbel <matthew.k.gumbel@intel.com"); +MODULE_DESCRIPTION("EFI Bootloader Control"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/firmware/efi/efivars.c b/drivers/firmware/efi/efivars.c index 096adcbcb5a9..116b244dee68 100644 --- a/drivers/firmware/efi/efivars.c +++ b/drivers/firmware/efi/efivars.c @@ -661,7 +661,7 @@ static void efivar_update_sysfs_entries(struct work_struct *work) return; err = efivar_init(efivar_update_sysfs_entry, entry, - true, false, &efivar_sysfs_list); + false, &efivar_sysfs_list); if (!err) break; @@ -730,8 +730,7 @@ int efivars_sysfs_init(void) return -ENOMEM; } - efivar_init(efivars_sysfs_callback, NULL, false, - true, &efivar_sysfs_list); + efivar_init(efivars_sysfs_callback, NULL, true, &efivar_sysfs_list); error = create_efivars_bin_attributes(); if (error) { diff --git a/drivers/firmware/efi/fake_mem.c b/drivers/firmware/efi/fake_mem.c index ed3a854950cc..48430aba13c1 100644 --- a/drivers/firmware/efi/fake_mem.c +++ b/drivers/firmware/efi/fake_mem.c @@ -57,7 +57,7 @@ static int __init cmp_fake_mem(const void *x1, const void *x2) void __init efi_fake_memmap(void) { u64 start, end, m_start, m_end, m_attr; - int new_nr_map = memmap.nr_map; + int new_nr_map = efi.memmap.nr_map; efi_memory_desc_t *md; phys_addr_t new_memmap_phy; void *new_memmap; @@ -68,8 +68,7 @@ void __init efi_fake_memmap(void) return; /* count up the number of EFI memory descriptor */ - for (old = memmap.map; old < memmap.map_end; old += memmap.desc_size) { - md = old; + for_each_efi_memory_desc(md) { start = md->phys_addr; end = start + (md->num_pages << EFI_PAGE_SHIFT) - 1; @@ -95,25 +94,25 @@ void __init efi_fake_memmap(void) } /* allocate memory for new EFI memmap */ - new_memmap_phy = memblock_alloc(memmap.desc_size * 
new_nr_map, + new_memmap_phy = memblock_alloc(efi.memmap.desc_size * new_nr_map, PAGE_SIZE); if (!new_memmap_phy) return; /* create new EFI memmap */ new_memmap = early_memremap(new_memmap_phy, - memmap.desc_size * new_nr_map); + efi.memmap.desc_size * new_nr_map); if (!new_memmap) { - memblock_free(new_memmap_phy, memmap.desc_size * new_nr_map); + memblock_free(new_memmap_phy, efi.memmap.desc_size * new_nr_map); return; } - for (old = memmap.map, new = new_memmap; - old < memmap.map_end; - old += memmap.desc_size, new += memmap.desc_size) { + for (old = efi.memmap.map, new = new_memmap; + old < efi.memmap.map_end; + old += efi.memmap.desc_size, new += efi.memmap.desc_size) { /* copy original EFI memory descriptor */ - memcpy(new, old, memmap.desc_size); + memcpy(new, old, efi.memmap.desc_size); md = new; start = md->phys_addr; end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1; @@ -134,8 +133,8 @@ void __init efi_fake_memmap(void) md->num_pages = (m_end - md->phys_addr + 1) >> EFI_PAGE_SHIFT; /* latter part */ - new += memmap.desc_size; - memcpy(new, old, memmap.desc_size); + new += efi.memmap.desc_size; + memcpy(new, old, efi.memmap.desc_size); md = new; md->phys_addr = m_end + 1; md->num_pages = (end - md->phys_addr + 1) >> @@ -147,16 +146,16 @@ void __init efi_fake_memmap(void) md->num_pages = (m_start - md->phys_addr) >> EFI_PAGE_SHIFT; /* middle part */ - new += memmap.desc_size; - memcpy(new, old, memmap.desc_size); + new += efi.memmap.desc_size; + memcpy(new, old, efi.memmap.desc_size); md = new; md->attribute |= m_attr; md->phys_addr = m_start; md->num_pages = (m_end - m_start + 1) >> EFI_PAGE_SHIFT; /* last part */ - new += memmap.desc_size; - memcpy(new, old, memmap.desc_size); + new += efi.memmap.desc_size; + memcpy(new, old, efi.memmap.desc_size); md = new; md->phys_addr = m_end + 1; md->num_pages = (end - m_end) >> @@ -169,8 +168,8 @@ void __init efi_fake_memmap(void) md->num_pages = (m_start - md->phys_addr) >> EFI_PAGE_SHIFT; /* latter part 
*/ - new += memmap.desc_size; - memcpy(new, old, memmap.desc_size); + new += efi.memmap.desc_size; + memcpy(new, old, efi.memmap.desc_size); md = new; md->phys_addr = m_start; md->num_pages = (end - md->phys_addr + 1) >> @@ -182,10 +181,10 @@ void __init efi_fake_memmap(void) /* swap into new EFI memmap */ efi_unmap_memmap(); - memmap.map = new_memmap; - memmap.phys_map = new_memmap_phy; - memmap.nr_map = new_nr_map; - memmap.map_end = memmap.map + memmap.nr_map * memmap.desc_size; + efi.memmap.map = new_memmap; + efi.memmap.phys_map = new_memmap_phy; + efi.memmap.nr_map = new_nr_map; + efi.memmap.map_end = efi.memmap.map + efi.memmap.nr_map * efi.memmap.desc_size; set_bit(EFI_MEMMAP, &efi.flags); /* print new EFI memmap */ diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index da99bbb74aeb..c06945160a41 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -28,7 +28,7 @@ OBJECT_FILES_NON_STANDARD := y # Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. 
KCOV_INSTRUMENT := n -lib-y := efi-stub-helper.o +lib-y := efi-stub-helper.o gop.o # include the stub's generic dependencies from lib/ when building for ARM/arm64 arm-deps := fdt_rw.c fdt_ro.c fdt_wip.c fdt.c fdt_empty_tree.c fdt_sw.c sort.c diff --git a/drivers/firmware/efi/libstub/arm-stub.c b/drivers/firmware/efi/libstub/arm-stub.c index 414deb85c2e5..993aa56755f6 100644 --- a/drivers/firmware/efi/libstub/arm-stub.c +++ b/drivers/firmware/efi/libstub/arm-stub.c @@ -20,27 +20,49 @@ bool __nokaslr; -static int efi_secureboot_enabled(efi_system_table_t *sys_table_arg) +static int efi_get_secureboot(efi_system_table_t *sys_table_arg) { - static efi_guid_t const var_guid = EFI_GLOBAL_VARIABLE_GUID; - static efi_char16_t const var_name[] = { + static efi_char16_t const sb_var_name[] = { 'S', 'e', 'c', 'u', 'r', 'e', 'B', 'o', 'o', 't', 0 }; + static efi_char16_t const sm_var_name[] = { + 'S', 'e', 't', 'u', 'p', 'M', 'o', 'd', 'e', 0 }; + efi_guid_t var_guid = EFI_GLOBAL_VARIABLE_GUID; efi_get_variable_t *f_getvar = sys_table_arg->runtime->get_variable; - unsigned long size = sizeof(u8); - efi_status_t status; u8 val; + unsigned long size = sizeof(val); + efi_status_t status; - status = f_getvar((efi_char16_t *)var_name, (efi_guid_t *)&var_guid, + status = f_getvar((efi_char16_t *)sb_var_name, (efi_guid_t *)&var_guid, NULL, &size, &val); + if (status != EFI_SUCCESS) + goto out_efi_err; + + if (val == 0) + return 0; + + status = f_getvar((efi_char16_t *)sm_var_name, (efi_guid_t *)&var_guid, + NULL, &size, &val); + + if (status != EFI_SUCCESS) + goto out_efi_err; + + if (val == 1) + return 0; + + return 1; + +out_efi_err: switch (status) { - case EFI_SUCCESS: - return val; case EFI_NOT_FOUND: return 0; + case EFI_DEVICE_ERROR: + return -EIO; + case EFI_SECURITY_VIOLATION: + return -EACCES; default: - return 1; + return -EINVAL; } } @@ -147,6 +169,25 @@ void efi_char16_printk(efi_system_table_t *sys_table_arg, out->output_string(out, str); } +static struct screen_info 
*setup_graphics(efi_system_table_t *sys_table_arg) +{ + efi_guid_t gop_proto = EFI_GRAPHICS_OUTPUT_PROTOCOL_GUID; + efi_status_t status; + unsigned long size; + void **gop_handle = NULL; + struct screen_info *si = NULL; + + size = 0; + status = efi_call_early(locate_handle, EFI_LOCATE_BY_PROTOCOL, + &gop_proto, NULL, &size, gop_handle); + if (status == EFI_BUFFER_TOO_SMALL) { + si = alloc_screen_info(sys_table_arg); + if (!si) + return NULL; + efi_setup_gop(sys_table_arg, si, &gop_proto, size); + } + return si; +} /* * This function handles the architcture specific differences between arm and @@ -185,6 +226,8 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table, efi_guid_t loaded_image_proto = LOADED_IMAGE_PROTOCOL_GUID; unsigned long reserve_addr = 0; unsigned long reserve_size = 0; + int secure_boot = 0; + struct screen_info *si; /* Check if we were booted by the EFI firmware */ if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) @@ -237,6 +280,8 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table, __nokaslr = true; } + si = setup_graphics(sys_table); + status = handle_kernel_image(sys_table, image_addr, &image_size, &reserve_addr, &reserve_size, @@ -250,12 +295,21 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table, if (status != EFI_SUCCESS) pr_efi_err(sys_table, "Failed to parse EFI cmdline options\n"); + secure_boot = efi_get_secureboot(sys_table); + if (secure_boot > 0) + pr_efi(sys_table, "UEFI Secure Boot is enabled.\n"); + + if (secure_boot < 0) { + pr_efi_err(sys_table, + "could not determine UEFI Secure Boot status.\n"); + } + /* * Unauthenticated device tree data is a security hazard, so * ignore 'dtb=' unless UEFI Secure Boot is disabled. 
*/ - if (efi_secureboot_enabled(sys_table)) { - pr_efi(sys_table, "UEFI Secure Boot is enabled.\n"); + if (secure_boot != 0 && strstr(cmdline_ptr, "dtb=")) { + pr_efi(sys_table, "Ignoring DTB from command line.\n"); } else { status = handle_cmdline_files(sys_table, image, cmdline_ptr, "dtb=", @@ -309,6 +363,7 @@ fail_free_image: efi_free(sys_table, image_size, *image_addr); efi_free(sys_table, reserve_size, reserve_addr); fail_free_cmdline: + free_screen_info(sys_table, si); efi_free(sys_table, cmdline_size, (unsigned long)cmdline_ptr); fail: return EFI_ERROR; diff --git a/drivers/firmware/efi/libstub/arm32-stub.c b/drivers/firmware/efi/libstub/arm32-stub.c index 6f42be4d0084..e1f0b28e1dcb 100644 --- a/drivers/firmware/efi/libstub/arm32-stub.c +++ b/drivers/firmware/efi/libstub/arm32-stub.c @@ -26,6 +26,43 @@ efi_status_t check_platform_features(efi_system_table_t *sys_table_arg) return EFI_SUCCESS; } +static efi_guid_t screen_info_guid = LINUX_EFI_ARM_SCREEN_INFO_TABLE_GUID; + +struct screen_info *alloc_screen_info(efi_system_table_t *sys_table_arg) +{ + struct screen_info *si; + efi_status_t status; + + /* + * Unlike on arm64, where we can directly fill out the screen_info + * structure from the stub, we need to allocate a buffer to hold + * its contents while we hand over to the kernel proper from the + * decompressor. 
+ */ + status = efi_call_early(allocate_pool, EFI_RUNTIME_SERVICES_DATA, + sizeof(*si), (void **)&si); + + if (status != EFI_SUCCESS) + return NULL; + + status = efi_call_early(install_configuration_table, + &screen_info_guid, si); + if (status == EFI_SUCCESS) + return si; + + efi_call_early(free_pool, si); + return NULL; +} + +void free_screen_info(efi_system_table_t *sys_table_arg, struct screen_info *si) +{ + if (!si) + return; + + efi_call_early(install_configuration_table, &screen_info_guid, NULL); + efi_call_early(free_pool, si); +} + efi_status_t handle_kernel_image(efi_system_table_t *sys_table, unsigned long *image_addr, unsigned long *image_size, diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c index 29ed2f9b218c..3bd127f95315 100644 --- a/drivers/firmware/efi/libstub/efi-stub-helper.c +++ b/drivers/firmware/efi/libstub/efi-stub-helper.c @@ -125,10 +125,12 @@ unsigned long get_dram_base(efi_system_table_t *sys_table_arg) map.map_end = map.map + map_size; - for_each_efi_memory_desc(&map, md) - if (md->attribute & EFI_MEMORY_WB) + for_each_efi_memory_desc_in_map(&map, md) { + if (md->attribute & EFI_MEMORY_WB) { if (membase > md->phys_addr) membase = md->phys_addr; + } + } efi_call_early(free_pool, map.map); diff --git a/drivers/firmware/efi/libstub/gop.c b/drivers/firmware/efi/libstub/gop.c new file mode 100644 index 000000000000..932742e4cf23 --- /dev/null +++ b/drivers/firmware/efi/libstub/gop.c @@ -0,0 +1,354 @@ +/* ----------------------------------------------------------------------- + * + * Copyright 2011 Intel Corporation; author Matt Fleming + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. 
+ * + * ----------------------------------------------------------------------- */ + +#include <linux/efi.h> +#include <linux/screen_info.h> +#include <asm/efi.h> +#include <asm/setup.h> + +static void find_bits(unsigned long mask, u8 *pos, u8 *size) +{ + u8 first, len; + + first = 0; + len = 0; + + if (mask) { + while (!(mask & 0x1)) { + mask = mask >> 1; + first++; + } + + while (mask & 0x1) { + mask = mask >> 1; + len++; + } + } + + *pos = first; + *size = len; +} + +static void +setup_pixel_info(struct screen_info *si, u32 pixels_per_scan_line, + struct efi_pixel_bitmask pixel_info, int pixel_format) +{ + if (pixel_format == PIXEL_RGB_RESERVED_8BIT_PER_COLOR) { + si->lfb_depth = 32; + si->lfb_linelength = pixels_per_scan_line * 4; + si->red_size = 8; + si->red_pos = 0; + si->green_size = 8; + si->green_pos = 8; + si->blue_size = 8; + si->blue_pos = 16; + si->rsvd_size = 8; + si->rsvd_pos = 24; + } else if (pixel_format == PIXEL_BGR_RESERVED_8BIT_PER_COLOR) { + si->lfb_depth = 32; + si->lfb_linelength = pixels_per_scan_line * 4; + si->red_size = 8; + si->red_pos = 16; + si->green_size = 8; + si->green_pos = 8; + si->blue_size = 8; + si->blue_pos = 0; + si->rsvd_size = 8; + si->rsvd_pos = 24; + } else if (pixel_format == PIXEL_BIT_MASK) { + find_bits(pixel_info.red_mask, &si->red_pos, &si->red_size); + find_bits(pixel_info.green_mask, &si->green_pos, + &si->green_size); + find_bits(pixel_info.blue_mask, &si->blue_pos, &si->blue_size); + find_bits(pixel_info.reserved_mask, &si->rsvd_pos, + &si->rsvd_size); + si->lfb_depth = si->red_size + si->green_size + + si->blue_size + si->rsvd_size; + si->lfb_linelength = (pixels_per_scan_line * si->lfb_depth) / 8; + } else { + si->lfb_depth = 4; + si->lfb_linelength = si->lfb_width / 2; + si->red_size = 0; + si->red_pos = 0; + si->green_size = 0; + si->green_pos = 0; + si->blue_size = 0; + si->blue_pos = 0; + si->rsvd_size = 0; + si->rsvd_pos = 0; + } +} + +static efi_status_t +__gop_query32(efi_system_table_t 
*sys_table_arg, + struct efi_graphics_output_protocol_32 *gop32, + struct efi_graphics_output_mode_info **info, + unsigned long *size, u64 *fb_base) +{ + struct efi_graphics_output_protocol_mode_32 *mode; + efi_graphics_output_protocol_query_mode query_mode; + efi_status_t status; + unsigned long m; + + m = gop32->mode; + mode = (struct efi_graphics_output_protocol_mode_32 *)m; + query_mode = (void *)(unsigned long)gop32->query_mode; + + status = __efi_call_early(query_mode, (void *)gop32, mode->mode, size, + info); + if (status != EFI_SUCCESS) + return status; + + *fb_base = mode->frame_buffer_base; + return status; +} + +static efi_status_t +setup_gop32(efi_system_table_t *sys_table_arg, struct screen_info *si, + efi_guid_t *proto, unsigned long size, void **gop_handle) +{ + struct efi_graphics_output_protocol_32 *gop32, *first_gop; + unsigned long nr_gops; + u16 width, height; + u32 pixels_per_scan_line; + u32 ext_lfb_base; + u64 fb_base; + struct efi_pixel_bitmask pixel_info; + int pixel_format; + efi_status_t status = EFI_NOT_FOUND; + u32 *handles = (u32 *)(unsigned long)gop_handle; + int i; + + first_gop = NULL; + gop32 = NULL; + + nr_gops = size / sizeof(u32); + for (i = 0; i < nr_gops; i++) { + struct efi_graphics_output_mode_info *info = NULL; + efi_guid_t conout_proto = EFI_CONSOLE_OUT_DEVICE_GUID; + bool conout_found = false; + void *dummy = NULL; + efi_handle_t h = (efi_handle_t)(unsigned long)handles[i]; + u64 current_fb_base; + + status = efi_call_early(handle_protocol, h, + proto, (void **)&gop32); + if (status != EFI_SUCCESS) + continue; + + status = efi_call_early(handle_protocol, h, + &conout_proto, &dummy); + if (status == EFI_SUCCESS) + conout_found = true; + + status = __gop_query32(sys_table_arg, gop32, &info, &size, + ¤t_fb_base); + if (status == EFI_SUCCESS && (!first_gop || conout_found)) { + /* + * Systems that use the UEFI Console Splitter may + * provide multiple GOP devices, not all of which are + * backed by real hardware. 
The workaround is to search + * for a GOP implementing the ConOut protocol, and if + * one isn't found, to just fall back to the first GOP. + */ + width = info->horizontal_resolution; + height = info->vertical_resolution; + pixel_format = info->pixel_format; + pixel_info = info->pixel_information; + pixels_per_scan_line = info->pixels_per_scan_line; + fb_base = current_fb_base; + + /* + * Once we've found a GOP supporting ConOut, + * don't bother looking any further. + */ + first_gop = gop32; + if (conout_found) + break; + } + } + + /* Did we find any GOPs? */ + if (!first_gop) + goto out; + + /* EFI framebuffer */ + si->orig_video_isVGA = VIDEO_TYPE_EFI; + + si->lfb_width = width; + si->lfb_height = height; + si->lfb_base = fb_base; + + ext_lfb_base = (u64)(unsigned long)fb_base >> 32; + if (ext_lfb_base) { + si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE; + si->ext_lfb_base = ext_lfb_base; + } + + si->pages = 1; + + setup_pixel_info(si, pixels_per_scan_line, pixel_info, pixel_format); + + si->lfb_size = si->lfb_linelength * si->lfb_height; + + si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS; +out: + return status; +} + +static efi_status_t +__gop_query64(efi_system_table_t *sys_table_arg, + struct efi_graphics_output_protocol_64 *gop64, + struct efi_graphics_output_mode_info **info, + unsigned long *size, u64 *fb_base) +{ + struct efi_graphics_output_protocol_mode_64 *mode; + efi_graphics_output_protocol_query_mode query_mode; + efi_status_t status; + unsigned long m; + + m = gop64->mode; + mode = (struct efi_graphics_output_protocol_mode_64 *)m; + query_mode = (void *)(unsigned long)gop64->query_mode; + + status = __efi_call_early(query_mode, (void *)gop64, mode->mode, size, + info); + if (status != EFI_SUCCESS) + return status; + + *fb_base = mode->frame_buffer_base; + return status; +} + +static efi_status_t +setup_gop64(efi_system_table_t *sys_table_arg, struct screen_info *si, + efi_guid_t *proto, unsigned long size, void **gop_handle) +{ + struct 
efi_graphics_output_protocol_64 *gop64, *first_gop; + unsigned long nr_gops; + u16 width, height; + u32 pixels_per_scan_line; + u32 ext_lfb_base; + u64 fb_base; + struct efi_pixel_bitmask pixel_info; + int pixel_format; + efi_status_t status = EFI_NOT_FOUND; + u64 *handles = (u64 *)(unsigned long)gop_handle; + int i; + + first_gop = NULL; + gop64 = NULL; + + nr_gops = size / sizeof(u64); + for (i = 0; i < nr_gops; i++) { + struct efi_graphics_output_mode_info *info = NULL; + efi_guid_t conout_proto = EFI_CONSOLE_OUT_DEVICE_GUID; + bool conout_found = false; + void *dummy = NULL; + efi_handle_t h = (efi_handle_t)(unsigned long)handles[i]; + u64 current_fb_base; + + status = efi_call_early(handle_protocol, h, + proto, (void **)&gop64); + if (status != EFI_SUCCESS) + continue; + + status = efi_call_early(handle_protocol, h, + &conout_proto, &dummy); + if (status == EFI_SUCCESS) + conout_found = true; + + status = __gop_query64(sys_table_arg, gop64, &info, &size, + &current_fb_base); + if (status == EFI_SUCCESS && (!first_gop || conout_found)) { + /* + * Systems that use the UEFI Console Splitter may + * provide multiple GOP devices, not all of which are + * backed by real hardware. The workaround is to search + * for a GOP implementing the ConOut protocol, and if + * one isn't found, to just fall back to the first GOP. + */ + width = info->horizontal_resolution; + height = info->vertical_resolution; + pixel_format = info->pixel_format; + pixel_info = info->pixel_information; + pixels_per_scan_line = info->pixels_per_scan_line; + fb_base = current_fb_base; + + /* + * Once we've found a GOP supporting ConOut, + * don't bother looking any further. + */ + first_gop = gop64; + if (conout_found) + break; + } + } + + /* Did we find any GOPs? 
*/ + if (!first_gop) + goto out; + + /* EFI framebuffer */ + si->orig_video_isVGA = VIDEO_TYPE_EFI; + + si->lfb_width = width; + si->lfb_height = height; + si->lfb_base = fb_base; + + ext_lfb_base = (u64)(unsigned long)fb_base >> 32; + if (ext_lfb_base) { + si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE; + si->ext_lfb_base = ext_lfb_base; + } + + si->pages = 1; + + setup_pixel_info(si, pixels_per_scan_line, pixel_info, pixel_format); + + si->lfb_size = si->lfb_linelength * si->lfb_height; + + si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS; +out: + return status; +} + +/* + * See if we have Graphics Output Protocol + */ +efi_status_t efi_setup_gop(efi_system_table_t *sys_table_arg, + struct screen_info *si, efi_guid_t *proto, + unsigned long size) +{ + efi_status_t status; + void **gop_handle = NULL; + + status = efi_call_early(allocate_pool, EFI_LOADER_DATA, + size, (void **)&gop_handle); + if (status != EFI_SUCCESS) + return status; + + status = efi_call_early(locate_handle, + EFI_LOCATE_BY_PROTOCOL, + proto, NULL, &size, gop_handle); + if (status != EFI_SUCCESS) + goto free_handle; + + if (efi_is_64bit()) { + status = setup_gop64(sys_table_arg, si, proto, size, + gop_handle); + } else { + status = setup_gop32(sys_table_arg, si, proto, size, + gop_handle); + } + +free_handle: + efi_call_early(free_pool, gop_handle); + return status; +} diff --git a/drivers/firmware/efi/memattr.c b/drivers/firmware/efi/memattr.c new file mode 100644 index 000000000000..236004b9a50d --- /dev/null +++ b/drivers/firmware/efi/memattr.c @@ -0,0 +1,182 @@ +/* + * Copyright (C) 2016 Linaro Ltd. <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#define pr_fmt(fmt) "efi: memattr: " fmt + +#include <linux/efi.h> +#include <linux/init.h> +#include <linux/io.h> +#include <linux/memblock.h> + +#include <asm/early_ioremap.h> + +static int __initdata tbl_size; + +/* + * Reserve the memory associated with the Memory Attributes configuration + * table, if it exists. + */ +int __init efi_memattr_init(void) +{ + efi_memory_attributes_table_t *tbl; + + if (efi.mem_attr_table == EFI_INVALID_TABLE_ADDR) + return 0; + + tbl = early_memremap(efi.mem_attr_table, sizeof(*tbl)); + if (!tbl) { + pr_err("Failed to map EFI Memory Attributes table @ 0x%lx\n", + efi.mem_attr_table); + return -ENOMEM; + } + + if (tbl->version > 1) { + pr_warn("Unexpected EFI Memory Attributes table version %d\n", + tbl->version); + goto unmap; + } + + tbl_size = sizeof(*tbl) + tbl->num_entries * tbl->desc_size; + memblock_reserve(efi.mem_attr_table, tbl_size); + +unmap: + early_memunmap(tbl, sizeof(*tbl)); + return 0; +} + +/* + * Returns a copy @out of the UEFI memory descriptor @in if it is covered + * entirely by a UEFI memory map entry with matching attributes. The virtual + * address of @out is set according to the matching entry that was found. 
+ */ +static bool entry_is_valid(const efi_memory_desc_t *in, efi_memory_desc_t *out) +{ + u64 in_paddr = in->phys_addr; + u64 in_size = in->num_pages << EFI_PAGE_SHIFT; + efi_memory_desc_t *md; + + *out = *in; + + if (in->type != EFI_RUNTIME_SERVICES_CODE && + in->type != EFI_RUNTIME_SERVICES_DATA) { + pr_warn("Entry type should be RuntimeServiceCode/Data\n"); + return false; + } + + if (!(in->attribute & (EFI_MEMORY_RO | EFI_MEMORY_XP))) { + pr_warn("Entry attributes invalid: RO and XP bits both cleared\n"); + return false; + } + + if (PAGE_SIZE > EFI_PAGE_SIZE && + (!PAGE_ALIGNED(in->phys_addr) || + !PAGE_ALIGNED(in->num_pages << EFI_PAGE_SHIFT))) { + /* + * Since arm64 may execute with page sizes of up to 64 KB, the + * UEFI spec mandates that RuntimeServices memory regions must + * be 64 KB aligned. We need to validate this here since we will + * not be able to tighten permissions on such regions without + * affecting adjacent regions. + */ + pr_warn("Entry address region misaligned\n"); + return false; + } + + for_each_efi_memory_desc(md) { + u64 md_paddr = md->phys_addr; + u64 md_size = md->num_pages << EFI_PAGE_SHIFT; + + if (!(md->attribute & EFI_MEMORY_RUNTIME)) + continue; + if (md->virt_addr == 0) { + /* no virtual mapping has been installed by the stub */ + break; + } + + if (md_paddr > in_paddr || (in_paddr - md_paddr) >= md_size) + continue; + + /* + * This entry covers the start of @in, check whether + * it covers the end as well. + */ + if (md_paddr + md_size < in_paddr + in_size) { + pr_warn("Entry covers multiple EFI memory map regions\n"); + return false; + } + + if (md->type != in->type) { + pr_warn("Entry type deviates from EFI memory map region type\n"); + return false; + } + + out->virt_addr = in_paddr + (md->virt_addr - md_paddr); + + return true; + } + + pr_warn("No matching entry found in the EFI memory map\n"); + return false; +} + +/* + * To be called after the EFI page tables have been populated. 
If a memory + * attributes table is available, its contents will be used to update the + * mappings with tightened permissions as described by the table. + * This requires the UEFI memory map to have already been populated with + * virtual addresses. + */ +int __init efi_memattr_apply_permissions(struct mm_struct *mm, + efi_memattr_perm_setter fn) +{ + efi_memory_attributes_table_t *tbl; + int i, ret; + + if (tbl_size <= sizeof(*tbl)) + return 0; + + /* + * We need the EFI memory map to be set up so we can use it to + * look up the virtual addresses of all entries in the EFI + * Memory Attributes table. If it isn't available, this + * function should not be called. + */ + if (WARN_ON(!efi_enabled(EFI_MEMMAP))) + return 0; + + tbl = memremap(efi.mem_attr_table, tbl_size, MEMREMAP_WB); + if (!tbl) { + pr_err("Failed to map EFI Memory Attributes table @ 0x%lx\n", + efi.mem_attr_table); + return -ENOMEM; + } + + if (efi_enabled(EFI_DBG)) + pr_info("Processing EFI Memory Attributes table:\n"); + + for (i = ret = 0; ret == 0 && i < tbl->num_entries; i++) { + efi_memory_desc_t md; + unsigned long size; + bool valid; + char buf[64]; + + valid = entry_is_valid((void *)tbl->entry + i * tbl->desc_size, + &md); + size = md.num_pages << EFI_PAGE_SHIFT; + if (efi_enabled(EFI_DBG) || !valid) + pr_info("%s 0x%012llx-0x%012llx %s\n", + valid ? 
"" : "!", md.phys_addr, + md.phys_addr + size - 1, + efi_md_typeattr_format(buf, sizeof(buf), &md)); + + if (valid) + ret = fn(mm, &md); + } + memunmap(tbl); + return ret; +} diff --git a/drivers/firmware/efi/reboot.c b/drivers/firmware/efi/reboot.c index 9c59d1c795d1..62ead9b9d871 100644 --- a/drivers/firmware/efi/reboot.c +++ b/drivers/firmware/efi/reboot.c @@ -9,7 +9,8 @@ int efi_reboot_quirk_mode = -1; void efi_reboot(enum reboot_mode reboot_mode, const char *__unused) { - int efi_mode; + const char *str[] = { "cold", "warm", "shutdown", "platform" }; + int efi_mode, cap_reset_mode; if (!efi_enabled(EFI_RUNTIME_SERVICES)) return; @@ -30,6 +31,15 @@ void efi_reboot(enum reboot_mode reboot_mode, const char *__unused) if (efi_reboot_quirk_mode != -1) efi_mode = efi_reboot_quirk_mode; + if (efi_capsule_pending(&cap_reset_mode)) { + if (efi_mode != cap_reset_mode) + printk(KERN_CRIT "efi: %s reset requested but pending " + "capsule update requires %s reset... Performing " + "%s reset.\n", str[efi_mode], str[cap_reset_mode], + str[cap_reset_mode]); + efi_mode = cap_reset_mode; + } + efi.reset_system(efi_mode, EFI_SUCCESS, 0, NULL); } diff --git a/drivers/firmware/efi/runtime-wrappers.c b/drivers/firmware/efi/runtime-wrappers.c index de6953039af6..23bef6bb73ee 100644 --- a/drivers/firmware/efi/runtime-wrappers.c +++ b/drivers/firmware/efi/runtime-wrappers.c @@ -16,10 +16,70 @@ #include <linux/bug.h> #include <linux/efi.h> +#include <linux/irqflags.h> #include <linux/mutex.h> #include <linux/spinlock.h> +#include <linux/stringify.h> #include <asm/efi.h> +static void efi_call_virt_check_flags(unsigned long flags, const char *call) +{ + unsigned long cur_flags, mismatch; + + local_save_flags(cur_flags); + + mismatch = flags ^ cur_flags; + if (!WARN_ON_ONCE(mismatch & ARCH_EFI_IRQ_FLAGS_MASK)) + return; + + add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_NOW_UNRELIABLE); + pr_err_ratelimited(FW_BUG "IRQ flags corrupted (0x%08lx=>0x%08lx) by EFI %s\n", + flags, cur_flags, 
call); + local_irq_restore(flags); +} + +/* + * Arch code can implement the following three template macros, avoiding + * repetition for the void/non-void return cases of {__,}efi_call_virt: + * + * * arch_efi_call_virt_setup + * + * Sets up the environment for the call (e.g. switching page tables, + * allowing kernel-mode use of floating point, if required). + * + * * arch_efi_call_virt + * + * Performs the call. The last expression in the macro must be the call + * itself, allowing the logic to be shared by the void and non-void + * cases. + * + * * arch_efi_call_virt_teardown + * + * Restores the usual kernel environment once the call has returned. + */ + +#define efi_call_virt(f, args...) \ +({ \ + efi_status_t __s; \ + unsigned long flags; \ + arch_efi_call_virt_setup(); \ + local_save_flags(flags); \ + __s = arch_efi_call_virt(f, args); \ + efi_call_virt_check_flags(flags, __stringify(f)); \ + arch_efi_call_virt_teardown(); \ + __s; \ +}) + +#define __efi_call_virt(f, args...) \ +({ \ + unsigned long flags; \ + arch_efi_call_virt_setup(); \ + local_save_flags(flags); \ + arch_efi_call_virt(f, args); \ + efi_call_virt_check_flags(flags, __stringify(f)); \ + arch_efi_call_virt_teardown(); \ +}) + /* * According to section 7.1 of the UEFI spec, Runtime Services are not fully * reentrant, and there are particular combinations of calls that need to be diff --git a/drivers/firmware/efi/vars.c b/drivers/firmware/efi/vars.c index 34b741940494..d3b751383286 100644 --- a/drivers/firmware/efi/vars.c +++ b/drivers/firmware/efi/vars.c @@ -329,39 +329,6 @@ check_var_size_nonblocking(u32 attributes, unsigned long size) return fops->query_variable_store(attributes, size, true); } -static int efi_status_to_err(efi_status_t status) -{ - int err; - - switch (status) { - case EFI_SUCCESS: - err = 0; - break; - case EFI_INVALID_PARAMETER: - err = -EINVAL; - break; - case EFI_OUT_OF_RESOURCES: - err = -ENOSPC; - break; - case EFI_DEVICE_ERROR: - err = -EIO; - break; - case 
EFI_WRITE_PROTECTED: - err = -EROFS; - break; - case EFI_SECURITY_VIOLATION: - err = -EACCES; - break; - case EFI_NOT_FOUND: - err = -ENOENT; - break; - default: - err = -EINVAL; - } - - return err; -} - static bool variable_is_present(efi_char16_t *variable_name, efi_guid_t *vendor, struct list_head *head) { @@ -452,8 +419,7 @@ static void dup_variable_bug(efi_char16_t *str16, efi_guid_t *vendor_guid, * Returns 0 on success, or a kernel error code on failure. */ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), - void *data, bool atomic, bool duplicates, - struct list_head *head) + void *data, bool duplicates, struct list_head *head) { const struct efivar_operations *ops = __efivars->ops; unsigned long variable_name_size = 1024; @@ -483,7 +449,7 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), &vendor_guid); switch (status) { case EFI_SUCCESS: - if (!atomic) + if (duplicates) spin_unlock_irq(&__efivars->lock); variable_name_size = var_name_strnsize(variable_name, @@ -498,21 +464,19 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), * and may end up looping here forever. 
*/ if (duplicates && - variable_is_present(variable_name, &vendor_guid, head)) { + variable_is_present(variable_name, &vendor_guid, + head)) { dup_variable_bug(variable_name, &vendor_guid, variable_name_size); - if (!atomic) - spin_lock_irq(&__efivars->lock); - status = EFI_NOT_FOUND; - break; + } else { + err = func(variable_name, vendor_guid, + variable_name_size, data); + if (err) + status = EFI_NOT_FOUND; } - err = func(variable_name, vendor_guid, variable_name_size, data); - if (err) - status = EFI_NOT_FOUND; - - if (!atomic) + if (duplicates) spin_lock_irq(&__efivars->lock); break; diff --git a/drivers/gpu/drm/amd/amdgpu/atombios_dp.c b/drivers/gpu/drm/amd/amdgpu/atombios_dp.c index bf731e9f643e..7f85c2c1d681 100644 --- a/drivers/gpu/drm/amd/amdgpu/atombios_dp.c +++ b/drivers/gpu/drm/amd/amdgpu/atombios_dp.c @@ -276,8 +276,8 @@ static int amdgpu_atombios_dp_get_dp_link_config(struct drm_connector *connector } } } else { - for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) { - for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) { + for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) { + for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) { max_pix_clock = (lane_num * link_rates[i] * 8) / bpp; if (max_pix_clock >= pix_clock) { *dp_lanes = lane_num; diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index a0f1bd711b53..e3f4c725a1c6 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -2872,20 +2872,6 @@ static void intel_dp_info(struct seq_file *m, intel_panel_info(m, &intel_connector->panel); } -static void intel_dp_mst_info(struct seq_file *m, - struct intel_connector *intel_connector) -{ - struct intel_encoder *intel_encoder = intel_connector->encoder; - struct intel_dp_mst_encoder *intel_mst = - enc_to_mst(&intel_encoder->base); - struct intel_digital_port *intel_dig_port = intel_mst->primary; - struct intel_dp 
*intel_dp = &intel_dig_port->dp; - bool has_audio = drm_dp_mst_port_has_audio(&intel_dp->mst_mgr, - intel_connector->port); - - seq_printf(m, "\taudio support: %s\n", yesno(has_audio)); -} - static void intel_hdmi_info(struct seq_file *m, struct intel_connector *intel_connector) { @@ -2929,8 +2915,6 @@ static void intel_connector_info(struct seq_file *m, intel_hdmi_info(m, intel_connector); else if (intel_encoder->type == INTEL_OUTPUT_LVDS) intel_lvds_info(m, intel_connector); - else if (intel_encoder->type == INTEL_OUTPUT_DP_MST) - intel_dp_mst_info(m, intel_connector); } seq_printf(m, "\tmodes:\n"); diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index fffdac801d3b..363bd79dea2e 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -7444,6 +7444,8 @@ enum skl_disp_power_wells { #define TRANS_CLK_SEL_DISABLED (0x0<<29) #define TRANS_CLK_SEL_PORT(x) (((x)+1)<<29) +#define CDCLK_FREQ _MMIO(0x46200) + #define _TRANSA_MSA_MISC 0x60410 #define _TRANSB_MSA_MISC 0x61410 #define _TRANSC_MSA_MISC 0x62410 diff --git a/drivers/gpu/drm/i915/intel_audio.c b/drivers/gpu/drm/i915/intel_audio.c index 30f921421b0c..7d281b40064a 100644 --- a/drivers/gpu/drm/i915/intel_audio.c +++ b/drivers/gpu/drm/i915/intel_audio.c @@ -262,8 +262,7 @@ static void hsw_audio_codec_disable(struct intel_encoder *encoder) tmp |= AUD_CONFIG_N_PROG_ENABLE; tmp &= ~AUD_CONFIG_UPPER_N_MASK; tmp &= ~AUD_CONFIG_LOWER_N_MASK; - if (intel_pipe_has_type(intel_crtc, INTEL_OUTPUT_DISPLAYPORT) || - intel_pipe_has_type(intel_crtc, INTEL_OUTPUT_DP_MST)) + if (intel_pipe_has_type(intel_crtc, INTEL_OUTPUT_DISPLAYPORT)) tmp |= AUD_CONFIG_N_VALUE_INDEX; I915_WRITE(HSW_AUD_CFG(pipe), tmp); @@ -476,8 +475,7 @@ static void ilk_audio_codec_enable(struct drm_connector *connector, tmp &= ~AUD_CONFIG_N_VALUE_INDEX; tmp &= ~AUD_CONFIG_N_PROG_ENABLE; tmp &= ~AUD_CONFIG_PIXEL_CLOCK_HDMI_MASK; - if (intel_pipe_has_type(intel_crtc, INTEL_OUTPUT_DISPLAYPORT) || - 
intel_pipe_has_type(intel_crtc, INTEL_OUTPUT_DP_MST)) + if (intel_pipe_has_type(intel_crtc, INTEL_OUTPUT_DISPLAYPORT)) tmp |= AUD_CONFIG_N_VALUE_INDEX; else tmp |= audio_config_hdmi_pixel_clock(adjusted_mode); @@ -515,8 +513,7 @@ void intel_audio_codec_enable(struct intel_encoder *intel_encoder) /* ELD Conn_Type */ connector->eld[5] &= ~(3 << 2); - if (intel_pipe_has_type(crtc, INTEL_OUTPUT_DISPLAYPORT) || - intel_pipe_has_type(crtc, INTEL_OUTPUT_DP_MST)) + if (intel_pipe_has_type(crtc, INTEL_OUTPUT_DISPLAYPORT)) connector->eld[5] |= (1 << 2); connector->eld[6] = drm_av_sync_delay(connector, adjusted_mode) / 2; diff --git a/drivers/gpu/drm/i915/intel_crt.c b/drivers/gpu/drm/i915/intel_crt.c index 505fc5cf26f8..0364292367b1 100644 --- a/drivers/gpu/drm/i915/intel_crt.c +++ b/drivers/gpu/drm/i915/intel_crt.c @@ -257,8 +257,14 @@ static bool intel_crt_compute_config(struct intel_encoder *encoder, pipe_config->has_pch_encoder = true; /* LPT FDI RX only supports 8bpc. */ - if (HAS_PCH_LPT(dev)) + if (HAS_PCH_LPT(dev)) { + if (pipe_config->bw_constrained && pipe_config->pipe_bpp < 24) { + DRM_DEBUG_KMS("LPT only supports 24bpp\n"); + return false; + } + pipe_config->pipe_bpp = 24; + } /* FDI must always be 2.7 GHz */ if (HAS_DDI(dev)) { diff --git a/drivers/gpu/drm/i915/intel_ddi.c b/drivers/gpu/drm/i915/intel_ddi.c index 3b57bf06abe8..96ffcc541e17 100644 --- a/drivers/gpu/drm/i915/intel_ddi.c +++ b/drivers/gpu/drm/i915/intel_ddi.c @@ -3106,23 +3106,6 @@ void intel_ddi_fdi_disable(struct drm_crtc *crtc) I915_WRITE(FDI_RX_CTL(PIPE_A), val); } -bool intel_ddi_is_audio_enabled(struct drm_i915_private *dev_priv, - struct intel_crtc *intel_crtc) -{ - u32 temp; - - if (intel_display_power_get_if_enabled(dev_priv, POWER_DOMAIN_AUDIO)) { - temp = I915_READ(HSW_AUD_PIN_ELD_CP_VLD); - - intel_display_power_put(dev_priv, POWER_DOMAIN_AUDIO); - - if (temp & AUDIO_OUTPUT_ENABLE(intel_crtc->pipe)) - return true; - } - - return false; -} - void intel_ddi_get_config(struct intel_encoder 
*encoder, struct intel_crtc_state *pipe_config) { @@ -3183,8 +3166,11 @@ void intel_ddi_get_config(struct intel_encoder *encoder, break; } - pipe_config->has_audio = - intel_ddi_is_audio_enabled(dev_priv, intel_crtc); + if (intel_display_power_is_enabled(dev_priv, POWER_DOMAIN_AUDIO)) { + temp = I915_READ(HSW_AUD_PIN_ELD_CP_VLD); + if (temp & AUDIO_OUTPUT_ENABLE(intel_crtc->pipe)) + pipe_config->has_audio = true; + } if (encoder->type == INTEL_OUTPUT_EDP && dev_priv->vbt.edp_bpp && pipe_config->pipe_bpp > dev_priv->vbt.edp_bpp) { diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 182f84937345..0104a06d01fd 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -7988,9 +7988,6 @@ static void i9xx_get_pfit_config(struct intel_crtc *crtc, pipe_config->gmch_pfit.control = tmp; pipe_config->gmch_pfit.pgm_ratios = I915_READ(PFIT_PGM_RATIOS); - if (INTEL_INFO(dev)->gen < 5) - pipe_config->gmch_pfit.lvds_border_bits = - I915_READ(LVDS) & LVDS_BORDER_ENABLE; } static void vlv_crtc_clock_get(struct intel_crtc *crtc, @@ -9752,6 +9749,8 @@ static void broadwell_set_cdclk(struct drm_device *dev, int cdclk) sandybridge_pcode_write(dev_priv, HSW_PCODE_DE_WRITE_FREQ_REQ, data); mutex_unlock(&dev_priv->rps.hw_lock); + I915_WRITE(CDCLK_FREQ, DIV_ROUND_CLOSEST(cdclk, 1000) - 1); + intel_update_cdclk(dev); WARN(cdclk != dev_priv->cdclk_freq, diff --git a/drivers/gpu/drm/i915/intel_dp_mst.c b/drivers/gpu/drm/i915/intel_dp_mst.c index 937e77228466..2c999725b3d4 100644 --- a/drivers/gpu/drm/i915/intel_dp_mst.c +++ b/drivers/gpu/drm/i915/intel_dp_mst.c @@ -78,8 +78,6 @@ static bool intel_dp_mst_compute_config(struct intel_encoder *encoder, return false; } - if (drm_dp_mst_port_has_audio(&intel_dp->mst_mgr, found->port)) - pipe_config->has_audio = true; mst_pbn = drm_dp_calc_pbn_mode(adjusted_mode->crtc_clock, bpp); pipe_config->pbn = mst_pbn; @@ -104,11 +102,6 @@ static void intel_mst_disable_dp(struct 
intel_encoder *encoder) struct intel_dp_mst_encoder *intel_mst = enc_to_mst(&encoder->base); struct intel_digital_port *intel_dig_port = intel_mst->primary; struct intel_dp *intel_dp = &intel_dig_port->dp; - struct drm_device *dev = encoder->base.dev; - struct drm_i915_private *dev_priv = dev->dev_private; - struct drm_crtc *crtc = encoder->base.crtc; - struct intel_crtc *intel_crtc = to_intel_crtc(crtc); - int ret; DRM_DEBUG_KMS("%d\n", intel_dp->active_mst_links); @@ -119,10 +112,6 @@ static void intel_mst_disable_dp(struct intel_encoder *encoder) if (ret) { DRM_ERROR("failed to update payload %d\n", ret); } - if (intel_crtc->config->has_audio) { - intel_audio_codec_disable(encoder); - intel_display_power_put(dev_priv, POWER_DOMAIN_AUDIO); - } } static void intel_mst_post_disable_dp(struct intel_encoder *encoder) @@ -221,7 +210,6 @@ static void intel_mst_enable_dp(struct intel_encoder *encoder) struct intel_dp *intel_dp = &intel_dig_port->dp; struct drm_device *dev = intel_dig_port->base.base.dev; struct drm_i915_private *dev_priv = dev->dev_private; - struct intel_crtc *crtc = to_intel_crtc(encoder->base.crtc); enum port port = intel_dig_port->port; int ret; @@ -234,13 +222,6 @@ static void intel_mst_enable_dp(struct intel_encoder *encoder) ret = drm_dp_check_act_status(&intel_dp->mst_mgr); ret = drm_dp_update_payload_part2(&intel_dp->mst_mgr); - - if (crtc->config->has_audio) { - DRM_DEBUG_DRIVER("Enabling DP audio on pipe %c\n", - pipe_name(crtc->pipe)); - intel_display_power_get(dev_priv, POWER_DOMAIN_AUDIO); - intel_audio_codec_enable(encoder); - } } static bool intel_dp_mst_enc_get_hw_state(struct intel_encoder *encoder, @@ -266,9 +247,6 @@ static void intel_dp_mst_enc_get_config(struct intel_encoder *encoder, pipe_config->has_dp_encoder = true; - pipe_config->has_audio = - intel_ddi_is_audio_enabled(dev_priv, crtc); - temp = I915_READ(TRANS_DDI_FUNC_CTL(cpu_transcoder)); if (temp & TRANS_DDI_PHSYNC) flags |= DRM_MODE_FLAG_PHSYNC; diff --git 
a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h index 7d3af3a72abe..9d0770c23fde 100644 --- a/drivers/gpu/drm/i915/intel_drv.h +++ b/drivers/gpu/drm/i915/intel_drv.h @@ -1019,8 +1019,6 @@ void intel_ddi_set_pipe_settings(struct drm_crtc *crtc); void intel_ddi_prepare_link_retrain(struct intel_dp *intel_dp); bool intel_ddi_connector_get_hw_state(struct intel_connector *intel_connector); void intel_ddi_fdi_disable(struct drm_crtc *crtc); -bool intel_ddi_is_audio_enabled(struct drm_i915_private *dev_priv, - struct intel_crtc *intel_crtc); void intel_ddi_get_config(struct intel_encoder *encoder, struct intel_crtc_state *pipe_config); struct intel_encoder * diff --git a/drivers/gpu/drm/i915/intel_lvds.c b/drivers/gpu/drm/i915/intel_lvds.c index cd9fe609aefb..10dc3517b63b 100644 --- a/drivers/gpu/drm/i915/intel_lvds.c +++ b/drivers/gpu/drm/i915/intel_lvds.c @@ -123,6 +123,10 @@ static void intel_lvds_get_config(struct intel_encoder *encoder, pipe_config->base.adjusted_mode.flags |= flags; + if (INTEL_INFO(dev)->gen < 5) + pipe_config->gmch_pfit.lvds_border_bits = + tmp & LVDS_BORDER_ENABLE; + /* gen2/3 store dither state in pfit control, needs to match */ if (INTEL_INFO(dev)->gen < 4) { tmp = I915_READ(PFIT_CONTROL); diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c index 8ed3cf34f82d..3425d8e737b3 100644 --- a/drivers/gpu/drm/i915/intel_pm.c +++ b/drivers/gpu/drm/i915/intel_pm.c @@ -6646,6 +6646,12 @@ static void broadwell_init_clock_gating(struct drm_device *dev) misccpctl = I915_READ(GEN7_MISCCPCTL); I915_WRITE(GEN7_MISCCPCTL, misccpctl & ~GEN7_DOP_CLOCK_GATE_ENABLE); I915_WRITE(GEN8_L3SQCREG1, BDW_WA_L3SQCREG1_DEFAULT); + /* + * Wait at least 100 clocks before re-enabling clock gating. See + * the definition of L3SQCREG1 in BSpec. 
+ */ + POSTING_READ(GEN8_L3SQCREG1); + udelay(1); I915_WRITE(GEN7_MISCCPCTL, misccpctl); /* diff --git a/drivers/gpu/drm/radeon/atombios_crtc.c b/drivers/gpu/drm/radeon/atombios_crtc.c index b80b08f71cb4..532127c55de6 100644 --- a/drivers/gpu/drm/radeon/atombios_crtc.c +++ b/drivers/gpu/drm/radeon/atombios_crtc.c @@ -1742,6 +1742,7 @@ static u32 radeon_get_pll_use_mask(struct drm_crtc *crtc) static int radeon_get_shared_dp_ppll(struct drm_crtc *crtc) { struct drm_device *dev = crtc->dev; + struct radeon_device *rdev = dev->dev_private; struct drm_crtc *test_crtc; struct radeon_crtc *test_radeon_crtc; @@ -1751,6 +1752,10 @@ static int radeon_get_shared_dp_ppll(struct drm_crtc *crtc) test_radeon_crtc = to_radeon_crtc(test_crtc); if (test_radeon_crtc->encoder && ENCODER_MODE_IS_DP(atombios_get_encoder_mode(test_radeon_crtc->encoder))) { + /* PPLL2 is exclusive to UNIPHYA on DCE61 */ + if (ASIC_IS_DCE61(rdev) && !ASIC_IS_DCE8(rdev) && + test_radeon_crtc->pll_id == ATOM_PPLL2) + continue; /* for DP use the same PLL for all */ if (test_radeon_crtc->pll_id != ATOM_PPLL_INVALID) return test_radeon_crtc->pll_id; @@ -1772,6 +1777,7 @@ static int radeon_get_shared_nondp_ppll(struct drm_crtc *crtc) { struct radeon_crtc *radeon_crtc = to_radeon_crtc(crtc); struct drm_device *dev = crtc->dev; + struct radeon_device *rdev = dev->dev_private; struct drm_crtc *test_crtc; struct radeon_crtc *test_radeon_crtc; u32 adjusted_clock, test_adjusted_clock; @@ -1787,6 +1793,10 @@ static int radeon_get_shared_nondp_ppll(struct drm_crtc *crtc) test_radeon_crtc = to_radeon_crtc(test_crtc); if (test_radeon_crtc->encoder && !ENCODER_MODE_IS_DP(atombios_get_encoder_mode(test_radeon_crtc->encoder))) { + /* PPLL2 is exclusive to UNIPHYA on DCE61 */ + if (ASIC_IS_DCE61(rdev) && !ASIC_IS_DCE8(rdev) && + test_radeon_crtc->pll_id == ATOM_PPLL2) + continue; /* check if we are already driving this connector with another crtc */ if (test_radeon_crtc->connector == radeon_crtc->connector) { /* if we are, 
return that pll */ diff --git a/drivers/gpu/drm/radeon/atombios_dp.c b/drivers/gpu/drm/radeon/atombios_dp.c index afa9db1dc0e3..cead089a9e7d 100644 --- a/drivers/gpu/drm/radeon/atombios_dp.c +++ b/drivers/gpu/drm/radeon/atombios_dp.c @@ -326,8 +326,8 @@ int radeon_dp_get_dp_link_config(struct drm_connector *connector, } } } else { - for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) { - for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) { + for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) { + for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) { max_pix_clock = (lane_num * link_rates[i] * 8) / bpp; if (max_pix_clock >= pix_clock) { *dp_lanes = lane_num; diff --git a/drivers/gpu/drm/radeon/radeon_dp_auxch.c b/drivers/gpu/drm/radeon/radeon_dp_auxch.c index 3b0c229d7dcd..db64e0062689 100644 --- a/drivers/gpu/drm/radeon/radeon_dp_auxch.c +++ b/drivers/gpu/drm/radeon/radeon_dp_auxch.c @@ -105,7 +105,7 @@ radeon_dp_aux_transfer_native(struct drm_dp_aux *aux, struct drm_dp_aux_msg *msg tmp &= AUX_HPD_SEL(0x7); tmp |= AUX_HPD_SEL(chan->rec.hpd); - tmp |= AUX_EN | AUX_LS_READ_EN; + tmp |= AUX_EN | AUX_LS_READ_EN | AUX_HPD_DISCON(0x1); WREG32(AUX_CONTROL + aux_offset[instance], tmp); diff --git a/drivers/input/misc/max8997_haptic.c b/drivers/input/misc/max8997_haptic.c index a806ba3818f7..8d6326d7e7be 100644 --- a/drivers/input/misc/max8997_haptic.c +++ b/drivers/input/misc/max8997_haptic.c @@ -255,12 +255,14 @@ static int max8997_haptic_probe(struct platform_device *pdev) struct max8997_dev *iodev = dev_get_drvdata(pdev->dev.parent); const struct max8997_platform_data *pdata = dev_get_platdata(iodev->dev); - const struct max8997_haptic_platform_data *haptic_pdata = - pdata->haptic_pdata; + const struct max8997_haptic_platform_data *haptic_pdata = NULL; struct max8997_haptic *chip; struct input_dev *input_dev; int error; + if (pdata) + haptic_pdata = pdata->haptic_pdata; + if (!haptic_pdata) { 
dev_err(&pdev->dev, "no haptic platform data\n"); return -EINVAL; diff --git a/drivers/input/misc/twl6040-vibra.c b/drivers/input/misc/twl6040-vibra.c index df3581f60628..42de34b92996 100644 --- a/drivers/input/misc/twl6040-vibra.c +++ b/drivers/input/misc/twl6040-vibra.c @@ -257,6 +257,7 @@ static int twl6040_vibra_probe(struct platform_device *pdev) int vddvibr_uV = 0; int error; + of_node_get(twl6040_core_dev->of_node); twl6040_core_node = of_find_node_by_name(twl6040_core_dev->of_node, "vibra"); if (!twl6040_core_node) { diff --git a/drivers/input/mouse/byd.c b/drivers/input/mouse/byd.c index fdc243ca93ed..e583f8b50454 100644 --- a/drivers/input/mouse/byd.c +++ b/drivers/input/mouse/byd.c @@ -2,6 +2,10 @@ * BYD TouchPad PS/2 mouse driver * * Copyright (C) 2015 Chris Diamand <chris@diamand.org> + * Copyright (C) 2015 Richard Pospesel + * Copyright (C) 2015 Tai Chi Minh Ralph Eastwood + * Copyright (C) 2015 Martin Wimpress + * Copyright (C) 2015 Jay Kuri * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 as published by diff --git a/drivers/media/v4l2-core/videobuf2-v4l2.c b/drivers/media/v4l2-core/videobuf2-v4l2.c index 7f366f1b0377..0b1b8c7b6ce5 100644 --- a/drivers/media/v4l2-core/videobuf2-v4l2.c +++ b/drivers/media/v4l2-core/videobuf2-v4l2.c @@ -74,11 +74,6 @@ static int __verify_planes_array(struct vb2_buffer *vb, const struct v4l2_buffer return 0; } -static int __verify_planes_array_core(struct vb2_buffer *vb, const void *pb) -{ - return __verify_planes_array(vb, pb); -} - /** * __verify_length() - Verify that the bytesused value for each plane fits in * the plane length and that the data offset doesn't exceed the bytesused value. 
@@ -442,7 +437,6 @@ static int __fill_vb2_buffer(struct vb2_buffer *vb, } static const struct vb2_buf_ops v4l2_buf_ops = { - .verify_planes_array = __verify_planes_array_core, .fill_user_buffer = __fill_v4l2_buffer, .fill_vb2_buffer = __fill_vb2_buffer, .copy_timestamp = __copy_timestamp, diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c b/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c index b212488606da..11be8044e0d7 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c @@ -43,6 +43,7 @@ static void xgene_cle_idt_to_hw(u32 dstqid, u32 fpsel, static void xgene_cle_dbptr_to_hw(struct xgene_enet_pdata *pdata, struct xgene_cle_dbptr *dbptr, u32 *buf) { + buf[0] = SET_VAL(CLE_DROP, dbptr->drop); buf[4] = SET_VAL(CLE_FPSEL, dbptr->fpsel) | SET_VAL(CLE_DSTQIDL, dbptr->dstqid); @@ -412,7 +413,7 @@ static int xgene_enet_cle_init(struct xgene_enet_pdata *pdata) .branch = { { /* IPV4 */ - .valid = 0, + .valid = 1, .next_packet_pointer = 22, .jump_bw = JMP_FW, .jump_rel = JMP_ABS, @@ -420,7 +421,7 @@ static int xgene_enet_cle_init(struct xgene_enet_pdata *pdata) .next_node = PKT_PROT_NODE, .next_branch = 0, .data = 0x8, - .mask = 0xffff + .mask = 0x0 }, { .valid = 0, @@ -456,7 +457,7 @@ static int xgene_enet_cle_init(struct xgene_enet_pdata *pdata) .next_node = RSS_IPV4_TCP_NODE, .next_branch = 0, .data = 0x0600, - .mask = 0xffff + .mask = 0x00ff }, { /* UDP */ @@ -468,7 +469,7 @@ static int xgene_enet_cle_init(struct xgene_enet_pdata *pdata) .next_node = RSS_IPV4_UDP_NODE, .next_branch = 0, .data = 0x1100, - .mask = 0xffff + .mask = 0x00ff }, { .valid = 0, @@ -642,7 +643,7 @@ static int xgene_enet_cle_init(struct xgene_enet_pdata *pdata) { /* TCP DST Port */ .valid = 0, - .next_packet_pointer = 256, + .next_packet_pointer = 258, .jump_bw = JMP_FW, .jump_rel = JMP_ABS, .operation = EQT, diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_cle.h b/drivers/net/ethernet/apm/xgene/xgene_enet_cle.h index 
29a17abdd828..3bf90683240e 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_cle.h +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_cle.h @@ -83,6 +83,8 @@ #define CLE_TYPE_POS 0 #define CLE_TYPE_LEN 2 +#define CLE_DROP_POS 28 +#define CLE_DROP_LEN 1 #define CLE_DSTQIDL_POS 25 #define CLE_DSTQIDL_LEN 7 #define CLE_DSTQIDH_POS 0 diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c index 39e081a70f5b..513d2a62ee6d 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c @@ -219,27 +219,30 @@ void xgene_enet_parse_error(struct xgene_enet_desc_ring *ring, struct xgene_enet_pdata *pdata, enum xgene_enet_err_code status) { - struct rtnl_link_stats64 *stats = &pdata->stats; - switch (status) { case INGRESS_CRC: - stats->rx_crc_errors++; + ring->rx_crc_errors++; + ring->rx_dropped++; break; case INGRESS_CHECKSUM: case INGRESS_CHECKSUM_COMPUTE: - stats->rx_errors++; + ring->rx_errors++; + ring->rx_dropped++; break; case INGRESS_TRUNC_FRAME: - stats->rx_frame_errors++; + ring->rx_frame_errors++; + ring->rx_dropped++; break; case INGRESS_PKT_LEN: - stats->rx_length_errors++; + ring->rx_length_errors++; + ring->rx_dropped++; break; case INGRESS_PKT_UNDER: - stats->rx_frame_errors++; + ring->rx_frame_errors++; + ring->rx_dropped++; break; case INGRESS_FIFO_OVERRUN: - stats->rx_fifo_errors++; + ring->rx_fifo_errors++; break; default: break; diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h index ba7da98af2ef..45220be3122f 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h @@ -86,7 +86,7 @@ enum xgene_enet_rm { #define RINGADDRL_POS 5 #define RINGADDRL_LEN 27 #define RINGADDRH_POS 0 -#define RINGADDRH_LEN 6 +#define RINGADDRH_LEN 7 #define RINGSIZE_POS 23 #define RINGSIZE_LEN 3 #define RINGTYPE_POS 19 @@ -94,9 +94,9 @@ enum xgene_enet_rm { #define 
RINGMODE_POS 20 #define RINGMODE_LEN 3 #define RECOMTIMEOUTL_POS 28 -#define RECOMTIMEOUTL_LEN 3 +#define RECOMTIMEOUTL_LEN 4 #define RECOMTIMEOUTH_POS 0 -#define RECOMTIMEOUTH_LEN 2 +#define RECOMTIMEOUTH_LEN 3 #define NUMMSGSINQ_POS 1 #define NUMMSGSINQ_LEN 16 #define ACCEPTLERR BIT(19) @@ -201,6 +201,8 @@ enum xgene_enet_rm { #define USERINFO_LEN 32 #define FPQNUM_POS 32 #define FPQNUM_LEN 12 +#define ELERR_POS 46 +#define ELERR_LEN 2 #define NV_POS 50 #define NV_LEN 1 #define LL_POS 51 diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c index 99d7e580e166..fd200883d228 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c @@ -443,8 +443,8 @@ static netdev_tx_t xgene_enet_start_xmit(struct sk_buff *skb, skb_tx_timestamp(skb); - pdata->stats.tx_packets++; - pdata->stats.tx_bytes += skb->len; + tx_ring->tx_packets++; + tx_ring->tx_bytes += skb->len; pdata->ring_ops->wr_cmd(tx_ring, count); return NETDEV_TX_OK; @@ -483,12 +483,12 @@ static int xgene_enet_rx_frame(struct xgene_enet_desc_ring *rx_ring, skb = buf_pool->rx_skb[skb_index]; /* checking for error */ - status = GET_VAL(LERR, le64_to_cpu(raw_desc->m0)); + status = (GET_VAL(ELERR, le64_to_cpu(raw_desc->m0)) << LERR_LEN) || + GET_VAL(LERR, le64_to_cpu(raw_desc->m0)); if (unlikely(status > 2)) { dev_kfree_skb_any(skb); xgene_enet_parse_error(rx_ring, netdev_priv(rx_ring->ndev), status); - pdata->stats.rx_dropped++; ret = -EIO; goto out; } @@ -506,8 +506,8 @@ static int xgene_enet_rx_frame(struct xgene_enet_desc_ring *rx_ring, xgene_enet_skip_csum(skb); } - pdata->stats.rx_packets++; - pdata->stats.rx_bytes += datalen; + rx_ring->rx_packets++; + rx_ring->rx_bytes += datalen; napi_gro_receive(&rx_ring->napi, skb); out: if (--rx_ring->nbufpool == 0) { @@ -630,7 +630,7 @@ static int xgene_enet_register_irq(struct net_device *ndev) ring = pdata->rx_ring[i]; irq_set_status_flags(ring->irq, 
IRQ_DISABLE_UNLAZY); ret = devm_request_irq(dev, ring->irq, xgene_enet_rx_irq, - IRQF_SHARED, ring->irq_name, ring); + 0, ring->irq_name, ring); if (ret) { netdev_err(ndev, "Failed to request irq %s\n", ring->irq_name); @@ -641,7 +641,7 @@ static int xgene_enet_register_irq(struct net_device *ndev) ring = pdata->tx_ring[i]->cp_ring; irq_set_status_flags(ring->irq, IRQ_DISABLE_UNLAZY); ret = devm_request_irq(dev, ring->irq, xgene_enet_rx_irq, - IRQF_SHARED, ring->irq_name, ring); + 0, ring->irq_name, ring); if (ret) { netdev_err(ndev, "Failed to request irq %s\n", ring->irq_name); @@ -1114,12 +1114,31 @@ static struct rtnl_link_stats64 *xgene_enet_get_stats64( { struct xgene_enet_pdata *pdata = netdev_priv(ndev); struct rtnl_link_stats64 *stats = &pdata->stats; + struct xgene_enet_desc_ring *ring; + int i; - stats->rx_errors += stats->rx_length_errors + - stats->rx_crc_errors + - stats->rx_frame_errors + - stats->rx_fifo_errors; - memcpy(storage, &pdata->stats, sizeof(struct rtnl_link_stats64)); + memset(stats, 0, sizeof(struct rtnl_link_stats64)); + for (i = 0; i < pdata->txq_cnt; i++) { + ring = pdata->tx_ring[i]; + if (ring) { + stats->tx_packets += ring->tx_packets; + stats->tx_bytes += ring->tx_bytes; + } + } + + for (i = 0; i < pdata->rxq_cnt; i++) { + ring = pdata->rx_ring[i]; + if (ring) { + stats->rx_packets += ring->rx_packets; + stats->rx_bytes += ring->rx_bytes; + stats->rx_errors += ring->rx_length_errors + + ring->rx_crc_errors + + ring->rx_frame_errors + + ring->rx_fifo_errors; + stats->rx_dropped += ring->rx_dropped; + } + } + memcpy(storage, stats, sizeof(struct rtnl_link_stats64)); return storage; } @@ -1234,6 +1253,13 @@ static int xgene_enet_get_irqs(struct xgene_enet_pdata *pdata) for (i = 0; i < max_irqs; i++) { ret = platform_get_irq(pdev, i); if (ret <= 0) { + if (pdata->phy_mode == PHY_INTERFACE_MODE_XGMII) { + max_irqs = i; + pdata->rxq_cnt = max_irqs / 2; + pdata->txq_cnt = max_irqs / 2; + pdata->cq_cnt = max_irqs / 2; + break; + } 
dev_err(dev, "Unable to get ENET IRQ\n"); ret = ret ? : -ENXIO; return ret; @@ -1437,19 +1463,28 @@ static void xgene_enet_setup_ops(struct xgene_enet_pdata *pdata) pdata->port_ops = &xgene_xgport_ops; pdata->cle_ops = &xgene_cle3in_ops; pdata->rm = RM0; - pdata->rxq_cnt = XGENE_NUM_RX_RING; - pdata->txq_cnt = XGENE_NUM_TX_RING; - pdata->cq_cnt = XGENE_NUM_TXC_RING; + if (!pdata->rxq_cnt) { + pdata->rxq_cnt = XGENE_NUM_RX_RING; + pdata->txq_cnt = XGENE_NUM_TX_RING; + pdata->cq_cnt = XGENE_NUM_TXC_RING; + } break; } if (pdata->enet_id == XGENE_ENET1) { switch (pdata->port_id) { case 0: - pdata->cpu_bufnum = START_CPU_BUFNUM_0; - pdata->eth_bufnum = START_ETH_BUFNUM_0; - pdata->bp_bufnum = START_BP_BUFNUM_0; - pdata->ring_num = START_RING_NUM_0; + if (pdata->phy_mode == PHY_INTERFACE_MODE_XGMII) { + pdata->cpu_bufnum = X2_START_CPU_BUFNUM_0; + pdata->eth_bufnum = X2_START_ETH_BUFNUM_0; + pdata->bp_bufnum = X2_START_BP_BUFNUM_0; + pdata->ring_num = START_RING_NUM_0; + } else { + pdata->cpu_bufnum = START_CPU_BUFNUM_0; + pdata->eth_bufnum = START_ETH_BUFNUM_0; + pdata->bp_bufnum = START_BP_BUFNUM_0; + pdata->ring_num = START_RING_NUM_0; + } break; case 1: if (pdata->phy_mode == PHY_INTERFACE_MODE_XGMII) { diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.h b/drivers/net/ethernet/apm/xgene/xgene_enet_main.h index 175d18890c7a..9d9cf445148c 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.h +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.h @@ -49,10 +49,10 @@ #define XGENE_ENET_MSS 1448 #define XGENE_MIN_ENET_FRAME_SIZE 60 -#define XGENE_MAX_ENET_IRQ 8 -#define XGENE_NUM_RX_RING 4 -#define XGENE_NUM_TX_RING 4 -#define XGENE_NUM_TXC_RING 4 +#define XGENE_MAX_ENET_IRQ 16 +#define XGENE_NUM_RX_RING 8 +#define XGENE_NUM_TX_RING 8 +#define XGENE_NUM_TXC_RING 8 #define START_CPU_BUFNUM_0 0 #define START_ETH_BUFNUM_0 2 @@ -121,6 +121,16 @@ struct xgene_enet_desc_ring { struct xgene_enet_raw_desc16 *raw_desc16; }; __le64 *exp_bufs; + u64 tx_packets; + 
u64 tx_bytes; + u64 rx_packets; + u64 rx_bytes; + u64 rx_dropped; + u64 rx_errors; + u64 rx_length_errors; + u64 rx_crc_errors; + u64 rx_frame_errors; + u64 rx_fifo_errors; }; struct xgene_mac_ops { diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.h b/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.h index 29a71b4dcc44..002df5a6756e 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.h +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.h @@ -33,7 +33,7 @@ #define LINK_STATUS BIT(2) #define LINK_UP BIT(15) #define MPA_IDLE_WITH_QMI_EMPTY BIT(12) -#define SG_RX_DV_GATE_REG_0_ADDR 0x0dfc +#define SG_RX_DV_GATE_REG_0_ADDR 0x05fc extern const struct xgene_mac_ops xgene_sgmac_ops; extern const struct xgene_port_ops xgene_sgport_ops; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 9d4e8e113fe1..c39a7f5c6a01 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -813,6 +813,46 @@ static inline struct sk_buff *bnxt_copy_skb(struct bnxt_napi *bnapi, u8 *data, return skb; } +static int bnxt_discard_rx(struct bnxt *bp, struct bnxt_napi *bnapi, + u32 *raw_cons, void *cmp) +{ + struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring; + struct rx_cmp *rxcmp = cmp; + u32 tmp_raw_cons = *raw_cons; + u8 cmp_type, agg_bufs = 0; + + cmp_type = RX_CMP_TYPE(rxcmp); + + if (cmp_type == CMP_TYPE_RX_L2_CMP) { + agg_bufs = (le32_to_cpu(rxcmp->rx_cmp_misc_v1) & + RX_CMP_AGG_BUFS) >> + RX_CMP_AGG_BUFS_SHIFT; + } else if (cmp_type == CMP_TYPE_RX_L2_TPA_END_CMP) { + struct rx_tpa_end_cmp *tpa_end = cmp; + + agg_bufs = (le32_to_cpu(tpa_end->rx_tpa_end_cmp_misc_v1) & + RX_TPA_END_CMP_AGG_BUFS) >> + RX_TPA_END_CMP_AGG_BUFS_SHIFT; + } + + if (agg_bufs) { + if (!bnxt_agg_bufs_valid(bp, cpr, agg_bufs, &tmp_raw_cons)) + return -EBUSY; + } + *raw_cons = tmp_raw_cons; + return 0; +} + +static void bnxt_sched_reset(struct bnxt *bp, struct bnxt_rx_ring_info *rxr) +{ + if 
(!rxr->bnapi->in_reset) { + rxr->bnapi->in_reset = true; + set_bit(BNXT_RESET_TASK_SP_EVENT, &bp->sp_event); + schedule_work(&bp->sp_task); + } + rxr->rx_next_cons = 0xffff; +} + static void bnxt_tpa_start(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, struct rx_tpa_start_cmp *tpa_start, struct rx_tpa_start_cmp_ext *tpa_start1) @@ -830,6 +870,11 @@ static void bnxt_tpa_start(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, prod_rx_buf = &rxr->rx_buf_ring[prod]; tpa_info = &rxr->rx_tpa[agg_id]; + if (unlikely(cons != rxr->rx_next_cons)) { + bnxt_sched_reset(bp, rxr); + return; + } + prod_rx_buf->data = tpa_info->data; mapping = tpa_info->mapping; @@ -867,6 +912,7 @@ static void bnxt_tpa_start(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, rxr->rx_prod = NEXT_RX(prod); cons = NEXT_RX(cons); + rxr->rx_next_cons = NEXT_RX(cons); cons_rx_buf = &rxr->rx_buf_ring[cons]; bnxt_reuse_rx_data(rxr, cons, cons_rx_buf->data); @@ -980,6 +1026,14 @@ static inline struct sk_buff *bnxt_tpa_end(struct bnxt *bp, dma_addr_t mapping; struct sk_buff *skb; + if (unlikely(bnapi->in_reset)) { + int rc = bnxt_discard_rx(bp, bnapi, raw_cons, tpa_end); + + if (rc < 0) + return ERR_PTR(-EBUSY); + return NULL; + } + tpa_info = &rxr->rx_tpa[agg_id]; data = tpa_info->data; prefetch(data); @@ -1146,6 +1200,12 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_napi *bnapi, u32 *raw_cons, cons = rxcmp->rx_cmp_opaque; rx_buf = &rxr->rx_buf_ring[cons]; data = rx_buf->data; + if (unlikely(cons != rxr->rx_next_cons)) { + int rc1 = bnxt_discard_rx(bp, bnapi, raw_cons, rxcmp); + + bnxt_sched_reset(bp, rxr); + return rc1; + } prefetch(data); agg_bufs = (le32_to_cpu(rxcmp->rx_cmp_misc_v1) & RX_CMP_AGG_BUFS) >> @@ -1245,6 +1305,7 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_napi *bnapi, u32 *raw_cons, next_rx: rxr->rx_prod = NEXT_RX(prod); + rxr->rx_next_cons = NEXT_RX(cons); next_rx_no_prod: *raw_cons = tmp_raw_cons; @@ -2486,6 +2547,7 @@ static void bnxt_clear_ring_indices(struct bnxt *bp) 
rxr->rx_prod = 0; rxr->rx_agg_prod = 0; rxr->rx_sw_agg_prod = 0; + rxr->rx_next_cons = 0; } } } @@ -4462,6 +4524,7 @@ static void bnxt_enable_napi(struct bnxt *bp) int i; for (i = 0; i < bp->cp_nr_rings; i++) { + bp->bnapi[i]->in_reset = false; bnxt_enable_poll(bp->bnapi[i]); napi_enable(&bp->bnapi[i]->napi); } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 8b823ff558ff..de9d53eee3dd 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -584,6 +584,7 @@ struct bnxt_rx_ring_info { u16 rx_prod; u16 rx_agg_prod; u16 rx_sw_agg_prod; + u16 rx_next_cons; void __iomem *rx_doorbell; void __iomem *rx_agg_doorbell; @@ -636,6 +637,7 @@ struct bnxt_napi { #ifdef CONFIG_NET_RX_BUSY_POLL atomic_t poll_state; #endif + bool in_reset; }; #ifdef CONFIG_NET_RX_BUSY_POLL diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c index fa05e347262f..06b819db51b1 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c @@ -533,6 +533,7 @@ static void nicvf_rcv_queue_config(struct nicvf *nic, struct queue_set *qs, nicvf_config_vlan_stripping(nic, nic->netdev->features); /* Enable Receive queue */ + memset(&rq_cfg, 0, sizeof(struct rq_cfg)); rq_cfg.ena = 1; rq_cfg.tcp_ena = 0; nicvf_queue_reg_write(nic, NIC_QSET_RQ_0_7_CFG, qidx, *(u64 *)&rq_cfg); @@ -565,6 +566,7 @@ void nicvf_cmp_queue_config(struct nicvf *nic, struct queue_set *qs, qidx, (u64)(cq->dmem.phys_base)); /* Enable Completion queue */ + memset(&cq_cfg, 0, sizeof(struct cq_cfg)); cq_cfg.ena = 1; cq_cfg.reset = 0; cq_cfg.caching = 0; @@ -613,6 +615,7 @@ static void nicvf_snd_queue_config(struct nicvf *nic, struct queue_set *qs, qidx, (u64)(sq->dmem.phys_base)); /* Enable send queue & set queue size */ + memset(&sq_cfg, 0, sizeof(struct sq_cfg)); sq_cfg.ena = 1; sq_cfg.reset = 0; sq_cfg.ldwb = 0; @@ 
-649,6 +652,7 @@ static void nicvf_rbdr_config(struct nicvf *nic, struct queue_set *qs, /* Enable RBDR & set queue size */ /* Buffer size should be in multiples of 128 bytes */ + memset(&rbdr_cfg, 0, sizeof(struct rbdr_cfg)); rbdr_cfg.ena = 1; rbdr_cfg.reset = 0; rbdr_cfg.ldwb = 0; diff --git a/drivers/net/ethernet/ezchip/nps_enet.c b/drivers/net/ethernet/ezchip/nps_enet.c index 1f23845a0694..085f9125cf42 100644 --- a/drivers/net/ethernet/ezchip/nps_enet.c +++ b/drivers/net/ethernet/ezchip/nps_enet.c @@ -145,7 +145,7 @@ static void nps_enet_tx_handler(struct net_device *ndev) u32 tx_ctrl_nt = (tx_ctrl_value & TX_CTL_NT_MASK) >> TX_CTL_NT_SHIFT; /* Check if we got TX */ - if (!priv->tx_packet_sent || tx_ctrl_ct) + if (!priv->tx_skb || tx_ctrl_ct) return; /* Ack Tx ctrl register */ @@ -160,7 +160,7 @@ static void nps_enet_tx_handler(struct net_device *ndev) } dev_kfree_skb(priv->tx_skb); - priv->tx_packet_sent = false; + priv->tx_skb = NULL; if (netif_queue_stopped(ndev)) netif_wake_queue(ndev); @@ -183,6 +183,9 @@ static int nps_enet_poll(struct napi_struct *napi, int budget) work_done = nps_enet_rx_handler(ndev); if (work_done < budget) { u32 buf_int_enable_value = 0; + u32 tx_ctrl_value = nps_enet_reg_get(priv, NPS_ENET_REG_TX_CTL); + u32 tx_ctrl_ct = + (tx_ctrl_value & TX_CTL_CT_MASK) >> TX_CTL_CT_SHIFT; napi_complete(napi); @@ -192,6 +195,18 @@ static int nps_enet_poll(struct napi_struct *napi, int budget) nps_enet_reg_set(priv, NPS_ENET_REG_BUF_INT_ENABLE, buf_int_enable_value); + + /* in case we will get a tx interrupt while interrupts + * are masked, we will lose it since the tx is edge interrupt. + * specifically, while executing the code section above, + * between nps_enet_tx_handler and the interrupts enable, all + * tx requests will be stuck until we will get an rx interrupt. + * the two code lines below will solve this situation by + * re-adding ourselves to the poll list. 
+ */ + + if (priv->tx_skb && !tx_ctrl_ct) + napi_reschedule(napi); } return work_done; @@ -217,7 +232,7 @@ static irqreturn_t nps_enet_irq_handler(s32 irq, void *dev_instance) u32 tx_ctrl_ct = (tx_ctrl_value & TX_CTL_CT_MASK) >> TX_CTL_CT_SHIFT; u32 rx_ctrl_cr = (rx_ctrl_value & RX_CTL_CR_MASK) >> RX_CTL_CR_SHIFT; - if ((!tx_ctrl_ct && priv->tx_packet_sent) || rx_ctrl_cr) + if ((!tx_ctrl_ct && priv->tx_skb) || rx_ctrl_cr) if (likely(napi_schedule_prep(&priv->napi))) { nps_enet_reg_set(priv, NPS_ENET_REG_BUF_INT_ENABLE, 0); __napi_schedule(&priv->napi); @@ -387,8 +402,6 @@ static void nps_enet_send_frame(struct net_device *ndev, /* Write the length of the Frame */ tx_ctrl_value |= length << TX_CTL_NT_SHIFT; - /* Indicate SW is done */ - priv->tx_packet_sent = true; tx_ctrl_value |= NPS_ENET_ENABLE << TX_CTL_CT_SHIFT; /* Send Frame */ nps_enet_reg_set(priv, NPS_ENET_REG_TX_CTL, tx_ctrl_value); @@ -465,7 +478,7 @@ static s32 nps_enet_open(struct net_device *ndev) s32 err; /* Reset private variables */ - priv->tx_packet_sent = false; + priv->tx_skb = NULL; priv->ge_mac_cfg_2_value = 0; priv->ge_mac_cfg_3_value = 0; @@ -534,6 +547,11 @@ static netdev_tx_t nps_enet_start_xmit(struct sk_buff *skb, priv->tx_skb = skb; + /* make sure tx_skb is actually written to the memory + * before the HW is informed and the IRQ is fired. + */ + wmb(); + nps_enet_send_frame(ndev, skb); return NETDEV_TX_OK; diff --git a/drivers/net/ethernet/ezchip/nps_enet.h b/drivers/net/ethernet/ezchip/nps_enet.h index d0cab600bce8..3939ca20cc9f 100644 --- a/drivers/net/ethernet/ezchip/nps_enet.h +++ b/drivers/net/ethernet/ezchip/nps_enet.h @@ -165,14 +165,12 @@ * struct nps_enet_priv - Storage of ENET's private information. * @regs_base: Base address of ENET memory-mapped control registers. * @irq: For RX/TX IRQ number. - * @tx_packet_sent: SW indication if frame is being sent. * @tx_skb: socket buffer of sent frame. * @napi: Structure for NAPI. 
*/ struct nps_enet_priv { void __iomem *regs_base; s32 irq; - bool tx_packet_sent; struct sk_buff *tx_skb; struct napi_struct napi; u32 ge_mac_cfg_2_value; diff --git a/drivers/net/ethernet/marvell/Kconfig b/drivers/net/ethernet/marvell/Kconfig index b5c6d42daa12..2664827ddecd 100644 --- a/drivers/net/ethernet/marvell/Kconfig +++ b/drivers/net/ethernet/marvell/Kconfig @@ -68,7 +68,7 @@ config MVNETA config MVNETA_BM tristate - default y if MVNETA=y && MVNETA_BM_ENABLE + default y if MVNETA=y && MVNETA_BM_ENABLE!=n default MVNETA_BM_ENABLE select HWBM help diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_minidump.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_minidump.c index cda9e604a95f..0844b7c75767 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_minidump.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_minidump.c @@ -1417,6 +1417,7 @@ void qlcnic_83xx_get_minidump_template(struct qlcnic_adapter *adapter) struct qlcnic_fw_dump *fw_dump = &ahw->fw_dump; struct pci_dev *pdev = adapter->pdev; bool extended = false; + int ret; prev_version = adapter->fw_version; current_version = qlcnic_83xx_get_fw_version(adapter); @@ -1427,8 +1428,11 @@ void qlcnic_83xx_get_minidump_template(struct qlcnic_adapter *adapter) if (qlcnic_83xx_md_check_extended_dump_capability(adapter)) extended = !qlcnic_83xx_extend_md_capab(adapter); - if (!qlcnic_fw_cmd_get_minidump_temp(adapter)) - dev_info(&pdev->dev, "Supports FW dump capability\n"); + ret = qlcnic_fw_cmd_get_minidump_temp(adapter); + if (ret) + return; + + dev_info(&pdev->dev, "Supports FW dump capability\n"); /* Once we have minidump template with extended iSCSI dump * capability, update the minidump capture mask to 0x1f as diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index 9e2a0bd8f5a8..4277d0c12101 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -1506,6 +1506,8 @@ static int ravb_close(struct net_device 
*ndev) priv->phydev = NULL; } + if (priv->chip_id == RCAR_GEN3) + free_irq(priv->emac_irq, ndev); free_irq(ndev->irq, ndev); napi_disable(&priv->napi[RAVB_NC]); diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 5590b9c182c9..445fc5aef308 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -790,9 +790,11 @@ void phy_start(struct phy_device *phydev) break; case PHY_HALTED: /* make sure interrupts are re-enabled for the PHY */ - err = phy_enable_interrupts(phydev); - if (err < 0) - break; + if (phydev->irq != PHY_POLL) { + err = phy_enable_interrupts(phydev); + if (err < 0) + break; + } phydev->state = PHY_RESUMING; do_resume = true; diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c index 75870e68a7c3..34731e29c589 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c @@ -105,6 +105,7 @@ void iwl_mvm_set_tx_cmd(struct iwl_mvm *mvm, struct sk_buff *skb, struct iwl_tx_cmd *tx_cmd, struct ieee80211_tx_info *info, u8 sta_id) { + struct ieee80211_tx_info *skb_info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr = (void *)skb->data; __le16 fc = hdr->frame_control; u32 tx_flags = le32_to_cpu(tx_cmd->tx_flags); @@ -185,7 +186,7 @@ void iwl_mvm_set_tx_cmd(struct iwl_mvm *mvm, struct sk_buff *skb, tx_cmd->tx_flags = cpu_to_le32(tx_flags); /* Total # bytes to be transmitted */ tx_cmd->len = cpu_to_le16((u16)skb->len + - (uintptr_t)info->driver_data[0]); + (uintptr_t)skb_info->driver_data[0]); tx_cmd->next_frame_len = 0; tx_cmd->life_time = cpu_to_le32(TX_CMD_LIFE_TIME_INFINITE); tx_cmd->sta_id = sta_id; @@ -327,10 +328,11 @@ static void iwl_mvm_set_tx_cmd_crypto(struct iwl_mvm *mvm, */ static struct iwl_device_cmd * iwl_mvm_set_tx_params(struct iwl_mvm *mvm, struct sk_buff *skb, - int hdrlen, struct ieee80211_sta *sta, u8 sta_id) + struct ieee80211_tx_info *info, int hdrlen, + struct ieee80211_sta *sta, u8 sta_id) { struct ieee80211_hdr *hdr = 
(struct ieee80211_hdr *)skb->data; - struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct ieee80211_tx_info *skb_info = IEEE80211_SKB_CB(skb); struct iwl_device_cmd *dev_cmd; struct iwl_tx_cmd *tx_cmd; @@ -350,10 +352,10 @@ iwl_mvm_set_tx_params(struct iwl_mvm *mvm, struct sk_buff *skb, iwl_mvm_set_tx_cmd_rate(mvm, tx_cmd, info, sta, hdr->frame_control); - memset(&info->status, 0, sizeof(info->status)); - memset(info->driver_data, 0, sizeof(info->driver_data)); + memset(&skb_info->status, 0, sizeof(skb_info->status)); + memset(skb_info->driver_data, 0, sizeof(skb_info->driver_data)); - info->driver_data[1] = dev_cmd; + skb_info->driver_data[1] = dev_cmd; return dev_cmd; } @@ -361,22 +363,25 @@ iwl_mvm_set_tx_params(struct iwl_mvm *mvm, struct sk_buff *skb, int iwl_mvm_tx_skb_non_sta(struct iwl_mvm *mvm, struct sk_buff *skb) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; - struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct ieee80211_tx_info *skb_info = IEEE80211_SKB_CB(skb); + struct ieee80211_tx_info info; struct iwl_device_cmd *dev_cmd; struct iwl_tx_cmd *tx_cmd; u8 sta_id; int hdrlen = ieee80211_hdrlen(hdr->frame_control); - if (WARN_ON_ONCE(info->flags & IEEE80211_TX_CTL_AMPDU)) + memcpy(&info, skb->cb, sizeof(info)); + + if (WARN_ON_ONCE(info.flags & IEEE80211_TX_CTL_AMPDU)) return -1; - if (WARN_ON_ONCE(info->flags & IEEE80211_TX_CTL_SEND_AFTER_DTIM && - (!info->control.vif || - info->hw_queue != info->control.vif->cab_queue))) + if (WARN_ON_ONCE(info.flags & IEEE80211_TX_CTL_SEND_AFTER_DTIM && + (!info.control.vif || + info.hw_queue != info.control.vif->cab_queue))) return -1; /* This holds the amsdu headers length */ - info->driver_data[0] = (void *)(uintptr_t)0; + skb_info->driver_data[0] = (void *)(uintptr_t)0; /* * IWL_MVM_OFFCHANNEL_QUEUE is used for ROC packets that can be used @@ -385,7 +390,7 @@ int iwl_mvm_tx_skb_non_sta(struct iwl_mvm *mvm, struct sk_buff *skb) * and hence needs to be sent on the aux queue 
*/ if (IEEE80211_SKB_CB(skb)->hw_queue == IWL_MVM_OFFCHANNEL_QUEUE && - info->control.vif->type == NL80211_IFTYPE_STATION) + info.control.vif->type == NL80211_IFTYPE_STATION) IEEE80211_SKB_CB(skb)->hw_queue = mvm->aux_queue; /* @@ -398,14 +403,14 @@ int iwl_mvm_tx_skb_non_sta(struct iwl_mvm *mvm, struct sk_buff *skb) * AUX station. */ sta_id = mvm->aux_sta.sta_id; - if (info->control.vif) { + if (info.control.vif) { struct iwl_mvm_vif *mvmvif = - iwl_mvm_vif_from_mac80211(info->control.vif); + iwl_mvm_vif_from_mac80211(info.control.vif); - if (info->control.vif->type == NL80211_IFTYPE_P2P_DEVICE || - info->control.vif->type == NL80211_IFTYPE_AP) + if (info.control.vif->type == NL80211_IFTYPE_P2P_DEVICE || + info.control.vif->type == NL80211_IFTYPE_AP) sta_id = mvmvif->bcast_sta.sta_id; - else if (info->control.vif->type == NL80211_IFTYPE_STATION && + else if (info.control.vif->type == NL80211_IFTYPE_STATION && is_multicast_ether_addr(hdr->addr1)) { u8 ap_sta_id = ACCESS_ONCE(mvmvif->ap_sta_id); @@ -414,19 +419,18 @@ int iwl_mvm_tx_skb_non_sta(struct iwl_mvm *mvm, struct sk_buff *skb) } } - IWL_DEBUG_TX(mvm, "station Id %d, queue=%d\n", sta_id, info->hw_queue); + IWL_DEBUG_TX(mvm, "station Id %d, queue=%d\n", sta_id, info.hw_queue); - dev_cmd = iwl_mvm_set_tx_params(mvm, skb, hdrlen, NULL, sta_id); + dev_cmd = iwl_mvm_set_tx_params(mvm, skb, &info, hdrlen, NULL, sta_id); if (!dev_cmd) return -1; - /* From now on, we cannot access info->control */ tx_cmd = (struct iwl_tx_cmd *)dev_cmd->payload; /* Copy MAC header from skb into command buffer */ memcpy(tx_cmd->hdr, hdr, hdrlen); - if (iwl_trans_tx(mvm->trans, skb, dev_cmd, info->hw_queue)) { + if (iwl_trans_tx(mvm->trans, skb, dev_cmd, info.hw_queue)) { iwl_trans_free_tx_cmd(mvm->trans, dev_cmd); return -1; } @@ -445,11 +449,11 @@ int iwl_mvm_tx_skb_non_sta(struct iwl_mvm *mvm, struct sk_buff *skb) #ifdef CONFIG_INET static int iwl_mvm_tx_tso(struct iwl_mvm *mvm, struct sk_buff *skb, + struct ieee80211_tx_info *info, 
struct ieee80211_sta *sta, struct sk_buff_head *mpdus_skb) { struct iwl_mvm_sta *mvmsta = iwl_mvm_sta_from_mac80211(sta); - struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr = (void *)skb->data; unsigned int mss = skb_shinfo(skb)->gso_size; struct sk_buff *tmp, *next; @@ -544,6 +548,8 @@ static int iwl_mvm_tx_tso(struct iwl_mvm *mvm, struct sk_buff *skb, /* This skb fits in one single A-MSDU */ if (num_subframes * mss >= tcp_payload_len) { + struct ieee80211_tx_info *skb_info = IEEE80211_SKB_CB(skb); + /* * Compute the length of all the data added for the A-MSDU. * This will be used to compute the length to write in the TX @@ -552,11 +558,10 @@ static int iwl_mvm_tx_tso(struct iwl_mvm *mvm, struct sk_buff *skb, * already had one set of SNAP / IP / TCP headers. */ num_subframes = DIV_ROUND_UP(tcp_payload_len, mss); - info = IEEE80211_SKB_CB(skb); amsdu_add = num_subframes * sizeof(struct ethhdr) + (num_subframes - 1) * (snap_ip_tcp + pad); /* This holds the amsdu headers length */ - info->driver_data[0] = (void *)(uintptr_t)amsdu_add; + skb_info->driver_data[0] = (void *)(uintptr_t)amsdu_add; __skb_queue_tail(mpdus_skb, skb); return 0; @@ -596,11 +601,14 @@ segment: ip_hdr(tmp)->id = htons(ip_base_id + i * num_subframes); if (tcp_payload_len > mss) { + struct ieee80211_tx_info *skb_info = + IEEE80211_SKB_CB(tmp); + num_subframes = DIV_ROUND_UP(tcp_payload_len, mss); - info = IEEE80211_SKB_CB(tmp); amsdu_add = num_subframes * sizeof(struct ethhdr) + (num_subframes - 1) * (snap_ip_tcp + pad); - info->driver_data[0] = (void *)(uintptr_t)amsdu_add; + skb_info->driver_data[0] = + (void *)(uintptr_t)amsdu_add; skb_shinfo(tmp)->gso_size = mss; } else { qc = ieee80211_get_qos_ctl((void *)tmp->data); @@ -622,6 +630,7 @@ segment: } #else /* CONFIG_INET */ static int iwl_mvm_tx_tso(struct iwl_mvm *mvm, struct sk_buff *skb, + struct ieee80211_tx_info *info, struct ieee80211_sta *sta, struct sk_buff_head *mpdus_skb) { @@ -636,10 +645,10 @@ static 
int iwl_mvm_tx_tso(struct iwl_mvm *mvm, struct sk_buff *skb, * Sets the fields in the Tx cmd that are crypto related */ static int iwl_mvm_tx_mpdu(struct iwl_mvm *mvm, struct sk_buff *skb, + struct ieee80211_tx_info *info, struct ieee80211_sta *sta) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; - struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct iwl_mvm_sta *mvmsta; struct iwl_device_cmd *dev_cmd; struct iwl_tx_cmd *tx_cmd; @@ -660,7 +669,8 @@ static int iwl_mvm_tx_mpdu(struct iwl_mvm *mvm, struct sk_buff *skb, if (WARN_ON_ONCE(mvmsta->sta_id == IWL_MVM_STATION_COUNT)) return -1; - dev_cmd = iwl_mvm_set_tx_params(mvm, skb, hdrlen, sta, mvmsta->sta_id); + dev_cmd = iwl_mvm_set_tx_params(mvm, skb, info, hdrlen, + sta, mvmsta->sta_id); if (!dev_cmd) goto drop; @@ -736,7 +746,8 @@ int iwl_mvm_tx_skb(struct iwl_mvm *mvm, struct sk_buff *skb, struct ieee80211_sta *sta) { struct iwl_mvm_sta *mvmsta = iwl_mvm_sta_from_mac80211(sta); - struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct ieee80211_tx_info *skb_info = IEEE80211_SKB_CB(skb); + struct ieee80211_tx_info info; struct sk_buff_head mpdus_skbs; unsigned int payload_len; int ret; @@ -747,21 +758,23 @@ int iwl_mvm_tx_skb(struct iwl_mvm *mvm, struct sk_buff *skb, if (WARN_ON_ONCE(mvmsta->sta_id == IWL_MVM_STATION_COUNT)) return -1; + memcpy(&info, skb->cb, sizeof(info)); + /* This holds the amsdu headers length */ - info->driver_data[0] = (void *)(uintptr_t)0; + skb_info->driver_data[0] = (void *)(uintptr_t)0; if (!skb_is_gso(skb)) - return iwl_mvm_tx_mpdu(mvm, skb, sta); + return iwl_mvm_tx_mpdu(mvm, skb, &info, sta); payload_len = skb_tail_pointer(skb) - skb_transport_header(skb) - tcp_hdrlen(skb) + skb->data_len; if (payload_len <= skb_shinfo(skb)->gso_size) - return iwl_mvm_tx_mpdu(mvm, skb, sta); + return iwl_mvm_tx_mpdu(mvm, skb, &info, sta); __skb_queue_head_init(&mpdus_skbs); - ret = iwl_mvm_tx_tso(mvm, skb, sta, &mpdus_skbs); + ret = iwl_mvm_tx_tso(mvm, skb, &info, 
sta, &mpdus_skbs); if (ret) return ret; @@ -771,7 +784,7 @@ int iwl_mvm_tx_skb(struct iwl_mvm *mvm, struct sk_buff *skb, while (!skb_queue_empty(&mpdus_skbs)) { skb = __skb_dequeue(&mpdus_skbs); - ret = iwl_mvm_tx_mpdu(mvm, skb, sta); + ret = iwl_mvm_tx_mpdu(mvm, skb, &info, sta); if (ret) { __skb_queue_purge(&mpdus_skbs); return ret; diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index b42f26029225..4412a57ec862 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -711,6 +711,7 @@ static void xenvif_tx_err(struct xenvif_queue *queue, if (cons == end) break; RING_COPY_REQUEST(&queue->tx, cons++, txp); + extra_count = 0; /* only the first frag can have extras */ } while (1); queue->tx.req_cons = cons; } diff --git a/drivers/pinctrl/pinctrl-at91-pio4.c b/drivers/pinctrl/pinctrl-at91-pio4.c index 4429312e848d..2c447130b954 100644 --- a/drivers/pinctrl/pinctrl-at91-pio4.c +++ b/drivers/pinctrl/pinctrl-at91-pio4.c @@ -722,9 +722,11 @@ static int atmel_conf_pin_config_group_set(struct pinctrl_dev *pctldev, break; case PIN_CONFIG_BIAS_PULL_UP: conf |= ATMEL_PIO_PUEN_MASK; + conf &= (~ATMEL_PIO_PDEN_MASK); break; case PIN_CONFIG_BIAS_PULL_DOWN: conf |= ATMEL_PIO_PDEN_MASK; + conf &= (~ATMEL_PIO_PUEN_MASK); break; case PIN_CONFIG_DRIVE_OPEN_DRAIN: if (arg == 0) diff --git a/drivers/regulator/axp20x-regulator.c b/drivers/regulator/axp20x-regulator.c index 40cd894e4df5..514a5e8fdbab 100644 --- a/drivers/regulator/axp20x-regulator.c +++ b/drivers/regulator/axp20x-regulator.c @@ -157,7 +157,9 @@ static struct regulator_ops axp20x_ops_sw = { static const struct regulator_linear_range axp20x_ldo4_ranges[] = { REGULATOR_LINEAR_RANGE(1250000, 0x0, 0x0, 0), REGULATOR_LINEAR_RANGE(1300000, 0x1, 0x8, 100000), - REGULATOR_LINEAR_RANGE(2500000, 0x9, 0xf, 100000), + REGULATOR_LINEAR_RANGE(2500000, 0x9, 0x9, 0), + REGULATOR_LINEAR_RANGE(2700000, 0xa, 0xb, 100000), + REGULATOR_LINEAR_RANGE(3000000, 0xc, 0xf, 100000), 
}; static const struct regulator_desc axp20x_regulators[] = { @@ -215,10 +217,14 @@ static const struct regulator_desc axp22x_regulators[] = { AXP22X_ELDO2_V_OUT, 0x1f, AXP22X_PWR_OUT_CTRL2, BIT(1)), AXP_DESC(AXP22X, ELDO3, "eldo3", "eldoin", 700, 3300, 100, AXP22X_ELDO3_V_OUT, 0x1f, AXP22X_PWR_OUT_CTRL2, BIT(2)), - AXP_DESC_IO(AXP22X, LDO_IO0, "ldo_io0", "ips", 1800, 3300, 100, + /* Note the datasheet only guarantees reliable operation up to + * 3.3V, this needs to be enforced via dts provided constraints */ + AXP_DESC_IO(AXP22X, LDO_IO0, "ldo_io0", "ips", 700, 3800, 100, AXP22X_LDO_IO0_V_OUT, 0x1f, AXP20X_GPIO0_CTRL, 0x07, AXP22X_IO_ENABLED, AXP22X_IO_DISABLED), - AXP_DESC_IO(AXP22X, LDO_IO1, "ldo_io1", "ips", 1800, 3300, 100, + /* Note the datasheet only guarantees reliable operation up to + * 3.3V, this needs to be enforced via dts provided constraints */ + AXP_DESC_IO(AXP22X, LDO_IO1, "ldo_io1", "ips", 700, 3800, 100, AXP22X_LDO_IO1_V_OUT, 0x1f, AXP20X_GPIO1_CTRL, 0x07, AXP22X_IO_ENABLED, AXP22X_IO_DISABLED), AXP_DESC_FIXED(AXP22X, RTC_LDO, "rtc_ldo", "ips", 3000), diff --git a/drivers/regulator/da9063-regulator.c b/drivers/regulator/da9063-regulator.c index ed9e7e96f877..c6af343f54ea 100644 --- a/drivers/regulator/da9063-regulator.c +++ b/drivers/regulator/da9063-regulator.c @@ -900,4 +900,4 @@ module_exit(da9063_regulator_cleanup); MODULE_AUTHOR("Krystian Garbaciak <krystian.garbaciak@diasemi.com>"); MODULE_DESCRIPTION("DA9063 regulators driver"); MODULE_LICENSE("GPL"); -MODULE_ALIAS("paltform:" DA9063_DRVNAME_REGULATORS); +MODULE_ALIAS("platform:" DA9063_DRVNAME_REGULATORS); diff --git a/drivers/regulator/gpio-regulator.c b/drivers/regulator/gpio-regulator.c index a8718e98674a..83e89e5d4752 100644 --- a/drivers/regulator/gpio-regulator.c +++ b/drivers/regulator/gpio-regulator.c @@ -162,6 +162,8 @@ of_get_gpio_regulator_config(struct device *dev, struct device_node *np, of_property_read_u32(np, "startup-delay-us", &config->startup_delay); config->enable_gpio 
= of_get_named_gpio(np, "enable-gpio", 0); + if (config->enable_gpio == -EPROBE_DEFER) + return ERR_PTR(-EPROBE_DEFER); /* Fetch GPIOs. - optional property*/ ret = of_gpio_count(np); diff --git a/drivers/regulator/s2mps11.c b/drivers/regulator/s2mps11.c index d24e2c783dc5..6dfa3502e1f1 100644 --- a/drivers/regulator/s2mps11.c +++ b/drivers/regulator/s2mps11.c @@ -308,7 +308,7 @@ static struct regulator_ops s2mps11_buck_ops = { .enable_mask = S2MPS11_ENABLE_MASK \ } -#define regulator_desc_s2mps11_buck6_10(num, min, step) { \ +#define regulator_desc_s2mps11_buck67810(num, min, step) { \ .name = "BUCK"#num, \ .id = S2MPS11_BUCK##num, \ .ops = &s2mps11_buck_ops, \ @@ -324,6 +324,22 @@ static struct regulator_ops s2mps11_buck_ops = { .enable_mask = S2MPS11_ENABLE_MASK \ } +#define regulator_desc_s2mps11_buck9 { \ + .name = "BUCK9", \ + .id = S2MPS11_BUCK9, \ + .ops = &s2mps11_buck_ops, \ + .type = REGULATOR_VOLTAGE, \ + .owner = THIS_MODULE, \ + .min_uV = MIN_3000_MV, \ + .uV_step = STEP_25_MV, \ + .n_voltages = S2MPS11_BUCK9_N_VOLTAGES, \ + .ramp_delay = S2MPS11_RAMP_DELAY, \ + .vsel_reg = S2MPS11_REG_B9CTRL2, \ + .vsel_mask = S2MPS11_BUCK9_VSEL_MASK, \ + .enable_reg = S2MPS11_REG_B9CTRL1, \ + .enable_mask = S2MPS11_ENABLE_MASK \ +} + static const struct regulator_desc s2mps11_regulators[] = { regulator_desc_s2mps11_ldo(1, STEP_25_MV), regulator_desc_s2mps11_ldo(2, STEP_50_MV), @@ -368,11 +384,11 @@ static const struct regulator_desc s2mps11_regulators[] = { regulator_desc_s2mps11_buck1_4(3), regulator_desc_s2mps11_buck1_4(4), regulator_desc_s2mps11_buck5, - regulator_desc_s2mps11_buck6_10(6, MIN_600_MV, STEP_6_25_MV), - regulator_desc_s2mps11_buck6_10(7, MIN_600_MV, STEP_6_25_MV), - regulator_desc_s2mps11_buck6_10(8, MIN_600_MV, STEP_6_25_MV), - regulator_desc_s2mps11_buck6_10(9, MIN_3000_MV, STEP_25_MV), - regulator_desc_s2mps11_buck6_10(10, MIN_750_MV, STEP_12_5_MV), + regulator_desc_s2mps11_buck67810(6, MIN_600_MV, STEP_6_25_MV), + 
regulator_desc_s2mps11_buck67810(7, MIN_600_MV, STEP_6_25_MV), + regulator_desc_s2mps11_buck67810(8, MIN_600_MV, STEP_6_25_MV), + regulator_desc_s2mps11_buck9, + regulator_desc_s2mps11_buck67810(10, MIN_750_MV, STEP_12_5_MV), }; static struct regulator_ops s2mps14_reg_ops; diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c b/drivers/scsi/device_handler/scsi_dh_alua.c index 8eaed0522aa3..a655cf29c16f 100644 --- a/drivers/scsi/device_handler/scsi_dh_alua.c +++ b/drivers/scsi/device_handler/scsi_dh_alua.c @@ -532,6 +532,7 @@ static int alua_rtpg(struct scsi_device *sdev, struct alua_port_group *pg) return SCSI_DH_DEV_TEMP_BUSY; retry: + err = 0; retval = submit_rtpg(sdev, buff, bufflen, &sense_hdr, pg->flags); if (retval) { diff --git a/drivers/scsi/qla1280.c b/drivers/scsi/qla1280.c index 5d0ec42a9317..634254a52301 100644 --- a/drivers/scsi/qla1280.c +++ b/drivers/scsi/qla1280.c @@ -4214,7 +4214,7 @@ static struct scsi_host_template qla1280_driver_template = { .eh_bus_reset_handler = qla1280_eh_bus_reset, .eh_host_reset_handler = qla1280_eh_adapter_reset, .bios_param = qla1280_biosparam, - .can_queue = 0xfffff, + .can_queue = MAX_OUTSTANDING_COMMANDS, .this_id = -1, .sg_tablesize = SG_ALL, .use_clustering = ENABLE_CLUSTERING, diff --git a/drivers/spi/spi-fsl-dspi.c b/drivers/spi/spi-fsl-dspi.c index 39412c9097c6..c1a2d747b246 100644 --- a/drivers/spi/spi-fsl-dspi.c +++ b/drivers/spi/spi-fsl-dspi.c @@ -385,8 +385,8 @@ static int dspi_transfer_one_message(struct spi_master *master, dspi->cur_chip = spi_get_ctldata(spi); dspi->cs = spi->chip_select; dspi->cs_change = 0; - if (dspi->cur_transfer->transfer_list.next - == &dspi->cur_msg->transfers) + if (list_is_last(&dspi->cur_transfer->transfer_list, + &dspi->cur_msg->transfers) || transfer->cs_change) dspi->cs_change = 1; dspi->void_write_data = dspi->cur_chip->void_write_data; diff --git a/drivers/spi/spi-omap2-mcspi.c b/drivers/spi/spi-omap2-mcspi.c index 43a02e377b3b..0caa3c8bef46 100644 --- 
a/drivers/spi/spi-omap2-mcspi.c +++ b/drivers/spi/spi-omap2-mcspi.c @@ -423,12 +423,16 @@ static void omap2_mcspi_tx_dma(struct spi_device *spi, if (mcspi_dma->dma_tx) { struct dma_async_tx_descriptor *tx; + struct scatterlist sg; dmaengine_slave_config(mcspi_dma->dma_tx, &cfg); - tx = dmaengine_prep_slave_sg(mcspi_dma->dma_tx, xfer->tx_sg.sgl, - xfer->tx_sg.nents, DMA_MEM_TO_DEV, - DMA_PREP_INTERRUPT | DMA_CTRL_ACK); + sg_init_table(&sg, 1); + sg_dma_address(&sg) = xfer->tx_dma; + sg_dma_len(&sg) = xfer->len; + + tx = dmaengine_prep_slave_sg(mcspi_dma->dma_tx, &sg, 1, + DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); if (tx) { tx->callback = omap2_mcspi_tx_callback; tx->callback_param = spi; @@ -474,15 +478,20 @@ omap2_mcspi_rx_dma(struct spi_device *spi, struct spi_transfer *xfer, if (mcspi_dma->dma_rx) { struct dma_async_tx_descriptor *tx; + struct scatterlist sg; dmaengine_slave_config(mcspi_dma->dma_rx, &cfg); if ((l & OMAP2_MCSPI_CHCONF_TURBO) && mcspi->fifo_depth == 0) dma_count -= es; - tx = dmaengine_prep_slave_sg(mcspi_dma->dma_rx, xfer->rx_sg.sgl, - xfer->rx_sg.nents, DMA_DEV_TO_MEM, - DMA_PREP_INTERRUPT | DMA_CTRL_ACK); + sg_init_table(&sg, 1); + sg_dma_address(&sg) = xfer->rx_dma; + sg_dma_len(&sg) = dma_count; + + tx = dmaengine_prep_slave_sg(mcspi_dma->dma_rx, &sg, 1, + DMA_DEV_TO_MEM, DMA_PREP_INTERRUPT | + DMA_CTRL_ACK); if (tx) { tx->callback = omap2_mcspi_rx_callback; tx->callback_param = spi; @@ -496,6 +505,8 @@ omap2_mcspi_rx_dma(struct spi_device *spi, struct spi_transfer *xfer, omap2_mcspi_set_dma_req(spi, 1, 1); wait_for_completion(&mcspi_dma->dma_rx_completion); + dma_unmap_single(mcspi->dev, xfer->rx_dma, count, + DMA_FROM_DEVICE); if (mcspi->fifo_depth > 0) return count; @@ -608,6 +619,8 @@ omap2_mcspi_txrx_dma(struct spi_device *spi, struct spi_transfer *xfer) if (tx != NULL) { wait_for_completion(&mcspi_dma->dma_tx_completion); + dma_unmap_single(mcspi->dev, xfer->tx_dma, xfer->len, + DMA_TO_DEVICE); if (mcspi->fifo_depth > 0) { 
irqstat_reg = mcspi->base + OMAP2_MCSPI_IRQSTATUS; @@ -1074,16 +1087,6 @@ static void omap2_mcspi_cleanup(struct spi_device *spi) gpio_free(spi->cs_gpio); } -static bool omap2_mcspi_can_dma(struct spi_master *master, - struct spi_device *spi, - struct spi_transfer *xfer) -{ - if (xfer->len < DMA_MIN_BYTES) - return false; - - return true; -} - static int omap2_mcspi_work_one(struct omap2_mcspi *mcspi, struct spi_device *spi, struct spi_transfer *t) { @@ -1265,6 +1268,32 @@ static int omap2_mcspi_transfer_one(struct spi_master *master, return -EINVAL; } + if (len < DMA_MIN_BYTES) + goto skip_dma_map; + + if (mcspi_dma->dma_tx && tx_buf != NULL) { + t->tx_dma = dma_map_single(mcspi->dev, (void *) tx_buf, + len, DMA_TO_DEVICE); + if (dma_mapping_error(mcspi->dev, t->tx_dma)) { + dev_dbg(mcspi->dev, "dma %cX %d bytes error\n", + 'T', len); + return -EINVAL; + } + } + if (mcspi_dma->dma_rx && rx_buf != NULL) { + t->rx_dma = dma_map_single(mcspi->dev, rx_buf, t->len, + DMA_FROM_DEVICE); + if (dma_mapping_error(mcspi->dev, t->rx_dma)) { + dev_dbg(mcspi->dev, "dma %cX %d bytes error\n", + 'R', len); + if (tx_buf != NULL) + dma_unmap_single(mcspi->dev, t->tx_dma, + len, DMA_TO_DEVICE); + return -EINVAL; + } + } + +skip_dma_map: return omap2_mcspi_work_one(mcspi, spi, t); } @@ -1348,7 +1377,6 @@ static int omap2_mcspi_probe(struct platform_device *pdev) master->transfer_one = omap2_mcspi_transfer_one; master->set_cs = omap2_mcspi_set_cs; master->cleanup = omap2_mcspi_cleanup; - master->can_dma = omap2_mcspi_can_dma; master->dev.of_node = node; master->max_speed_hz = OMAP2_MCSPI_MAX_FREQ; master->min_speed_hz = OMAP2_MCSPI_MAX_FREQ >> 15; diff --git a/drivers/spi/spi-pxa2xx.c b/drivers/spi/spi-pxa2xx.c index 85e59a406a4c..86138e4101b0 100644 --- a/drivers/spi/spi-pxa2xx.c +++ b/drivers/spi/spi-pxa2xx.c @@ -126,7 +126,7 @@ static const struct lpss_config lpss_platforms[] = { .reg_general = -1, .reg_ssp = 0x20, .reg_cs_ctrl = 0x24, - .reg_capabilities = 0xfc, + 
.reg_capabilities = -1, .rx_threshold = 1, .tx_threshold_lo = 32, .tx_threshold_hi = 56, diff --git a/drivers/spi/spi-ti-qspi.c b/drivers/spi/spi-ti-qspi.c index eac3c960b2de..443f664534e1 100644 --- a/drivers/spi/spi-ti-qspi.c +++ b/drivers/spi/spi-ti-qspi.c @@ -94,6 +94,7 @@ struct ti_qspi { #define QSPI_FLEN(n) ((n - 1) << 0) #define QSPI_WLEN_MAX_BITS 128 #define QSPI_WLEN_MAX_BYTES 16 +#define QSPI_WLEN_MASK QSPI_WLEN(QSPI_WLEN_MAX_BITS) /* STATUS REGISTER */ #define BUSY 0x01 @@ -235,16 +236,16 @@ static inline int ti_qspi_poll_wc(struct ti_qspi *qspi) return -ETIMEDOUT; } -static int qspi_write_msg(struct ti_qspi *qspi, struct spi_transfer *t) +static int qspi_write_msg(struct ti_qspi *qspi, struct spi_transfer *t, + int count) { - int wlen, count, xfer_len; + int wlen, xfer_len; unsigned int cmd; const u8 *txbuf; u32 data; txbuf = t->tx_buf; cmd = qspi->cmd | QSPI_WR_SNGL; - count = t->len; wlen = t->bits_per_word >> 3; /* in bytes */ xfer_len = wlen; @@ -304,9 +305,10 @@ static int qspi_write_msg(struct ti_qspi *qspi, struct spi_transfer *t) return 0; } -static int qspi_read_msg(struct ti_qspi *qspi, struct spi_transfer *t) +static int qspi_read_msg(struct ti_qspi *qspi, struct spi_transfer *t, + int count) { - int wlen, count; + int wlen; unsigned int cmd; u8 *rxbuf; @@ -323,7 +325,6 @@ static int qspi_read_msg(struct ti_qspi *qspi, struct spi_transfer *t) cmd |= QSPI_RD_SNGL; break; } - count = t->len; wlen = t->bits_per_word >> 3; /* in bytes */ while (count) { @@ -354,12 +355,13 @@ static int qspi_read_msg(struct ti_qspi *qspi, struct spi_transfer *t) return 0; } -static int qspi_transfer_msg(struct ti_qspi *qspi, struct spi_transfer *t) +static int qspi_transfer_msg(struct ti_qspi *qspi, struct spi_transfer *t, + int count) { int ret; if (t->tx_buf) { - ret = qspi_write_msg(qspi, t); + ret = qspi_write_msg(qspi, t, count); if (ret) { dev_dbg(qspi->dev, "Error while writing\n"); return ret; @@ -367,7 +369,7 @@ static int qspi_transfer_msg(struct 
ti_qspi *qspi, struct spi_transfer *t) } if (t->rx_buf) { - ret = qspi_read_msg(qspi, t); + ret = qspi_read_msg(qspi, t, count); if (ret) { dev_dbg(qspi->dev, "Error while reading\n"); return ret; @@ -450,7 +452,8 @@ static int ti_qspi_start_transfer_one(struct spi_master *master, struct spi_device *spi = m->spi; struct spi_transfer *t; int status = 0, ret; - int frame_length; + unsigned int frame_len_words, transfer_len_words; + int wlen; /* setup device control reg */ qspi->dc = 0; @@ -462,14 +465,15 @@ static int ti_qspi_start_transfer_one(struct spi_master *master, if (spi->mode & SPI_CS_HIGH) qspi->dc |= QSPI_CSPOL(spi->chip_select); - frame_length = (m->frame_length << 3) / spi->bits_per_word; - - frame_length = clamp(frame_length, 0, QSPI_FRAME); + frame_len_words = 0; + list_for_each_entry(t, &m->transfers, transfer_list) + frame_len_words += t->len / (t->bits_per_word >> 3); + frame_len_words = min_t(unsigned int, frame_len_words, QSPI_FRAME); /* setup command reg */ qspi->cmd = 0; qspi->cmd |= QSPI_EN_CS(spi->chip_select); - qspi->cmd |= QSPI_FLEN(frame_length); + qspi->cmd |= QSPI_FLEN(frame_len_words); ti_qspi_write(qspi, qspi->dc, QSPI_SPI_DC_REG); @@ -479,16 +483,23 @@ static int ti_qspi_start_transfer_one(struct spi_master *master, ti_qspi_disable_memory_map(spi); list_for_each_entry(t, &m->transfers, transfer_list) { - qspi->cmd |= QSPI_WLEN(t->bits_per_word); + qspi->cmd = ((qspi->cmd & ~QSPI_WLEN_MASK) | + QSPI_WLEN(t->bits_per_word)); + + wlen = t->bits_per_word >> 3; + transfer_len_words = min(t->len / wlen, frame_len_words); - ret = qspi_transfer_msg(qspi, t); + ret = qspi_transfer_msg(qspi, t, transfer_len_words * wlen); if (ret) { dev_dbg(qspi->dev, "transfer message failed\n"); mutex_unlock(&qspi->list_lock); return -EINVAL; } - m->actual_length += t->len; + m->actual_length += transfer_len_words * wlen; + frame_len_words -= transfer_len_words; + if (frame_len_words == 0) + break; } mutex_unlock(&qspi->list_lock); diff --git 
a/drivers/video/fbdev/Kconfig b/drivers/video/fbdev/Kconfig index 983280e8d93f..e5a391aecde1 100644 --- a/drivers/video/fbdev/Kconfig +++ b/drivers/video/fbdev/Kconfig @@ -761,7 +761,7 @@ config FB_VESA config FB_EFI bool "EFI-based Framebuffer Support" - depends on (FB = y) && X86 && EFI + depends on (FB = y) && !IA64 && EFI select FB_CFB_FILLRECT select FB_CFB_COPYAREA select FB_CFB_IMAGEBLIT diff --git a/drivers/video/fbdev/efifb.c b/drivers/video/fbdev/efifb.c index 95d293b7445a..f4c045c0051c 100644 --- a/drivers/video/fbdev/efifb.c +++ b/drivers/video/fbdev/efifb.c @@ -6,16 +6,14 @@ * */ -#include <linux/module.h> #include <linux/kernel.h> +#include <linux/efi.h> #include <linux/errno.h> #include <linux/fb.h> #include <linux/platform_device.h> #include <linux/screen_info.h> -#include <linux/dmi.h> -#include <linux/pci.h> #include <video/vga.h> -#include <asm/sysfb.h> +#include <asm/efi.h> static bool request_mem_succeeded = false; @@ -85,21 +83,13 @@ static struct fb_ops efifb_ops = { static int efifb_setup(char *options) { char *this_opt; - int i; if (options && *options) { while ((this_opt = strsep(&options, ",")) != NULL) { if (!*this_opt) continue; - for (i = 0; i < M_UNKNOWN; i++) { - if (efifb_dmi_list[i].base != 0 && - !strcmp(this_opt, efifb_dmi_list[i].optname)) { - screen_info.lfb_base = efifb_dmi_list[i].base; - screen_info.lfb_linelength = efifb_dmi_list[i].stride; - screen_info.lfb_width = efifb_dmi_list[i].width; - screen_info.lfb_height = efifb_dmi_list[i].height; - } - } + efifb_setup_from_dmi(&screen_info, this_opt); + if (!strncmp(this_opt, "base:", 5)) screen_info.lfb_base = simple_strtoul(this_opt+5, NULL, 0); else if (!strncmp(this_opt, "stride:", 7)) @@ -338,5 +328,4 @@ static struct platform_driver efifb_driver = { .remove = efifb_remove, }; -module_platform_driver(efifb_driver); -MODULE_LICENSE("GPL"); +builtin_platform_driver(efifb_driver); diff --git a/drivers/xen/efi.c b/drivers/xen/efi.c index be7e56a338e8..e9d2135445c1 100644 --- 
a/drivers/xen/efi.c +++ b/drivers/xen/efi.c @@ -316,7 +316,6 @@ static const struct efi efi_xen __initconst = { .get_next_high_mono_count = xen_efi_get_next_high_mono_count, .reset_system = NULL, /* Functionality provided by Xen. */ .set_virtual_address_map = NULL, /* Not used under Xen. */ - .memmap = NULL, /* Not used under Xen. */ .flags = 0 /* Initialized later. */ }; diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index feef8a9c4de7..f02404052b7b 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -112,7 +112,6 @@ static int ecryptfs_readdir(struct file *file, struct dir_context *ctx) .sb = inode->i_sb, }; lower_file = ecryptfs_file_to_lower(file); - lower_file->f_pos = ctx->pos; rc = iterate_dir(lower_file, &buf.ctx); ctx->pos = buf.ctx.pos; if (rc < 0) @@ -223,14 +222,6 @@ static int ecryptfs_open(struct inode *inode, struct file *file) } ecryptfs_set_file_lower( file, ecryptfs_inode_to_private(inode)->lower_file); - if (d_is_dir(ecryptfs_dentry)) { - ecryptfs_printk(KERN_DEBUG, "This is a directory\n"); - mutex_lock(&crypt_stat->cs_mutex); - crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); - mutex_unlock(&crypt_stat->cs_mutex); - rc = 0; - goto out; - } rc = read_or_initialize_metadata(ecryptfs_dentry); if (rc) goto out_put; @@ -247,6 +238,45 @@ out: return rc; } +/** + * ecryptfs_dir_open + * @inode: inode speciying file to open + * @file: Structure to return filled in + * + * Opens the file specified by inode. 
+ * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_dir_open(struct inode *inode, struct file *file) +{ + struct dentry *ecryptfs_dentry = file->f_path.dentry; + /* Private value of ecryptfs_dentry allocated in + * ecryptfs_lookup() */ + struct ecryptfs_file_info *file_info; + struct file *lower_file; + + /* Released in ecryptfs_release or end of function if failure */ + file_info = kmem_cache_zalloc(ecryptfs_file_info_cache, GFP_KERNEL); + ecryptfs_set_file_private(file, file_info); + if (unlikely(!file_info)) { + ecryptfs_printk(KERN_ERR, + "Error attempting to allocate memory\n"); + return -ENOMEM; + } + lower_file = dentry_open(ecryptfs_dentry_to_lower_path(ecryptfs_dentry), + file->f_flags, current_cred()); + if (IS_ERR(lower_file)) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the lower file for the dentry with name " + "[%pd]; rc = [%ld]\n", __func__, + ecryptfs_dentry, PTR_ERR(lower_file)); + kmem_cache_free(ecryptfs_file_info_cache, file_info); + return PTR_ERR(lower_file); + } + ecryptfs_set_file_lower(file, lower_file); + return 0; +} + static int ecryptfs_flush(struct file *file, fl_owner_t td) { struct file *lower_file = ecryptfs_file_to_lower(file); @@ -267,6 +297,19 @@ static int ecryptfs_release(struct inode *inode, struct file *file) return 0; } +static int ecryptfs_dir_release(struct inode *inode, struct file *file) +{ + fput(ecryptfs_file_to_lower(file)); + kmem_cache_free(ecryptfs_file_info_cache, + ecryptfs_file_to_private(file)); + return 0; +} + +static loff_t ecryptfs_dir_llseek(struct file *file, loff_t offset, int whence) +{ + return vfs_llseek(ecryptfs_file_to_lower(file), offset, whence); +} + static int ecryptfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { @@ -346,20 +389,16 @@ const struct file_operations ecryptfs_dir_fops = { #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, #endif - .open = ecryptfs_open, - .flush = ecryptfs_flush, - .release = ecryptfs_release, 
+ .open = ecryptfs_dir_open, + .release = ecryptfs_dir_release, .fsync = ecryptfs_fsync, - .fasync = ecryptfs_fasync, - .splice_read = generic_file_splice_read, - .llseek = default_llseek, + .llseek = ecryptfs_dir_llseek, }; const struct file_operations ecryptfs_main_fops = { .llseek = generic_file_llseek, .read_iter = ecryptfs_read_update_atime, .write_iter = generic_file_write_iter, - .iterate = ecryptfs_readdir, .unlocked_ioctl = ecryptfs_unlocked_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c index d48e0d261d78..5f22e74bbade 100644 --- a/fs/efivarfs/file.c +++ b/fs/efivarfs/file.c @@ -157,7 +157,7 @@ efivarfs_ioc_setxflags(struct file *file, void __user *arg) return 0; } -long +static long efivarfs_file_ioctl(struct file *file, unsigned int cmd, unsigned long p) { void __user *arg = (void __user *)p; diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 553c5d2db4a4..9cb54a38832d 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -216,8 +216,7 @@ static int efivarfs_fill_super(struct super_block *sb, void *data, int silent) INIT_LIST_HEAD(&efivarfs_list); - err = efivar_init(efivarfs_callback, (void *)sb, false, - true, &efivarfs_list); + err = efivar_init(efivarfs_callback, (void *)sb, true, &efivarfs_list); if (err) __efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL, NULL); diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index 5384ceb35b1c..98b3eb7d8eaf 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c @@ -203,6 +203,8 @@ int get_rock_ridge_filename(struct iso_directory_record *de, int retnamlen = 0; int truncate = 0; int ret = 0; + char *p; + int len; if (!ISOFS_SB(inode->i_sb)->s_rock) return 0; @@ -267,12 +269,17 @@ repeat: rr->u.NM.flags); break; } - if ((strlen(retname) + rr->len - 5) >= 254) { + len = rr->len - 5; + if (retnamlen + len >= 254) { truncate = 1; break; } - strncat(retname, rr->u.NM.name, rr->len - 5); - retnamlen += rr->len - 5; + p = 
memchr(rr->u.NM.name, '\0', len); + if (unlikely(p)) + len = p - rr->u.NM.name; + memcpy(retname + retnamlen, rr->u.NM.name, len); + retnamlen += len; + retname[retnamlen] = '\0'; break; case SIG('R', 'E'): kfree(rs.buffer); diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 03b688d19f69..37f9678ae4df 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -153,9 +153,9 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, p = buf + len + nlen; *p = '\0'; for (kn = kn_to; kn != common; kn = kn->parent) { - nlen = strlen(kn->name); - p -= nlen; - memcpy(p, kn->name, nlen); + size_t tmp = strlen(kn->name); + p -= tmp; + memcpy(p, kn->name, tmp); *(--p) = '/'; } diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index f73541fbe7af..3d670a3678f2 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -15,6 +15,7 @@ #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/namei.h> +#include <linux/seq_file.h> #include "kernfs-internal.h" @@ -40,6 +41,19 @@ static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry) return 0; } +static int kernfs_sop_show_path(struct seq_file *sf, struct dentry *dentry) +{ + struct kernfs_node *node = dentry->d_fsdata; + struct kernfs_root *root = kernfs_root(node); + struct kernfs_syscall_ops *scops = root->syscall_ops; + + if (scops && scops->show_path) + return scops->show_path(sf, node, root); + + seq_dentry(sf, dentry, " \t\n\\"); + return 0; +} + const struct super_operations kernfs_sops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, @@ -47,6 +61,7 @@ const struct super_operations kernfs_sops = { .remount_fs = kernfs_sop_remount_fs, .show_options = kernfs_sop_show_options, + .show_path = kernfs_sop_show_path, }; /** diff --git a/fs/namei.c b/fs/namei.c index 1d9ca2d5dff6..42f8ca038254 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1794,30 +1794,49 @@ static inline unsigned int fold_hash(unsigned long hash) return hash_64(hash, 32); } +/* + * This is George Marsaglia's 
XORSHIFT generator. + * It implements a maximum-period LFSR in only a few + * instructions. It also has the property (required + * by hash_name()) that mix_hash(0) = 0. + */ +static inline unsigned long mix_hash(unsigned long hash) +{ + hash ^= hash << 13; + hash ^= hash >> 7; + hash ^= hash << 17; + return hash; +} + #else /* 32-bit case */ #define fold_hash(x) (x) +static inline unsigned long mix_hash(unsigned long hash) +{ + hash ^= hash << 13; + hash ^= hash >> 17; + hash ^= hash << 5; + return hash; +} + #endif unsigned int full_name_hash(const unsigned char *name, unsigned int len) { - unsigned long a, mask; - unsigned long hash = 0; + unsigned long a, hash = 0; for (;;) { a = load_unaligned_zeropad(name); if (len < sizeof(unsigned long)) break; - hash += a; - hash *= 9; + hash = mix_hash(hash + a); name += sizeof(unsigned long); len -= sizeof(unsigned long); if (!len) goto done; } - mask = bytemask_from_count(len); - hash += mask & a; + hash += a & bytemask_from_count(len); done: return fold_hash(hash); } @@ -1835,7 +1854,7 @@ static inline u64 hash_name(const char *name) hash = a = 0; len = -sizeof(unsigned long); do { - hash = (hash + a) * 9; + hash = mix_hash(hash + a); len += sizeof(unsigned long); a = load_unaligned_zeropad(name+len); b = a ^ REPEAT_BYTE('/'); @@ -2267,6 +2286,33 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, EXPORT_SYMBOL(vfs_path_lookup); /** + * lookup_hash - lookup single pathname component on already hashed name + * @name: name and hash to lookup + * @base: base directory to lookup from + * + * The name must have been verified and hashed (see lookup_one_len()). Using + * this after just full_name_hash() is unsafe. + * + * This function also doesn't check for search permission on base directory. + * + * Use lookup_one_len_unlocked() instead, unless you really know what you are + * doing. + * + * Do not hold i_mutex; this helper takes i_mutex if necessary. 
+ */ +struct dentry *lookup_hash(const struct qstr *name, struct dentry *base) +{ + struct dentry *ret; + + ret = lookup_dcache(name, base, 0); + if (!ret) + ret = lookup_slow(name, base, 0); + + return ret; +} +EXPORT_SYMBOL(lookup_hash); + +/** * lookup_one_len - filesystem helper to lookup single pathname component * @name: pathname component to lookup * @base: base directory to lookup from @@ -2337,7 +2383,6 @@ struct dentry *lookup_one_len_unlocked(const char *name, struct qstr this; unsigned int c; int err; - struct dentry *ret; this.name = name; this.len = len; @@ -2369,10 +2414,7 @@ struct dentry *lookup_one_len_unlocked(const char *name, if (err) return ERR_PTR(err); - ret = lookup_dcache(&this, base, 0); - if (!ret) - ret = lookup_slow(&this, base, 0); - return ret; + return lookup_hash(&this, base); } EXPORT_SYMBOL(lookup_one_len_unlocked); @@ -2942,22 +2984,10 @@ no_open: dentry = lookup_real(dir, dentry, nd->flags); if (IS_ERR(dentry)) return PTR_ERR(dentry); - - if (create_error) { - int open_flag = op->open_flag; - - error = create_error; - if ((open_flag & O_EXCL)) { - if (!dentry->d_inode) - goto out; - } else if (!dentry->d_inode) { - goto out; - } else if ((open_flag & O_TRUNC) && - d_is_reg(dentry)) { - goto out; - } - /* will fail later, go on to get the right error */ - } + } + if (create_error && !dentry->d_inode) { + error = create_error; + goto out; } looked_up: path->dentry = dentry; @@ -4213,7 +4243,11 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, bool new_is_dir = false; unsigned max_links = new_dir->i_sb->s_max_links; - if (source == target) + /* + * Check source == target. + * On overlayfs need to look at underlying inodes. 
+ */ + if (vfs_select_inode(old_dentry, 0) == vfs_select_inode(new_dentry, 0)) return 0; error = may_delete(old_dir, old_dentry, is_dir); diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 0cdf497c91ef..2162434728c0 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -322,3 +322,90 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) brelse(di_bh); return acl; } + +int ocfs2_acl_chmod(struct inode *inode, struct buffer_head *bh) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct posix_acl *acl; + int ret; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) + return 0; + + acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, bh); + if (IS_ERR(acl) || !acl) + return PTR_ERR(acl); + ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + if (ret) + return ret; + ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS, + acl, NULL, NULL); + posix_acl_release(acl); + return ret; +} + +/* + * Initialize the ACLs of a new inode. If parent directory has default ACL, + * then clone to new inode. Called from ocfs2_mknod. 
+ */ +int ocfs2_init_acl(handle_t *handle, + struct inode *inode, + struct inode *dir, + struct buffer_head *di_bh, + struct buffer_head *dir_bh, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct posix_acl *acl = NULL; + int ret = 0, ret2; + umode_t mode; + + if (!S_ISLNK(inode->i_mode)) { + if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { + acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT, + dir_bh); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } + if (!acl) { + mode = inode->i_mode & ~current_umask(); + ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode); + if (ret) { + mlog_errno(ret); + goto cleanup; + } + } + } + if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { + if (S_ISDIR(inode->i_mode)) { + ret = ocfs2_set_acl(handle, inode, di_bh, + ACL_TYPE_DEFAULT, acl, + meta_ac, data_ac); + if (ret) + goto cleanup; + } + mode = inode->i_mode; + ret = __posix_acl_create(&acl, GFP_NOFS, &mode); + if (ret < 0) + return ret; + + ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode); + if (ret2) { + mlog_errno(ret2); + ret = ret2; + goto cleanup; + } + if (ret > 0) { + ret = ocfs2_set_acl(handle, inode, + di_bh, ACL_TYPE_ACCESS, + acl, meta_ac, data_ac); + } + } +cleanup: + posix_acl_release(acl); + return ret; +} diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index 3fce68d08625..2783a75b3999 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -35,5 +35,10 @@ int ocfs2_set_acl(handle_t *handle, struct posix_acl *acl, struct ocfs2_alloc_context *meta_ac, struct ocfs2_alloc_context *data_ac); +extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *); +extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, + struct buffer_head *, struct buffer_head *, + struct ocfs2_alloc_context *, + struct ocfs2_alloc_context *); #endif /* OCFS2_ACL_H */ diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 5308841756be..59cce53c91d8 100644 --- a/fs/ocfs2/file.c +++ 
b/fs/ocfs2/file.c @@ -1268,20 +1268,20 @@ bail_unlock_rw: if (size_change) ocfs2_rw_unlock(inode, 1); bail: - brelse(bh); /* Release quota pointers in case we acquired them */ for (qtype = 0; qtype < OCFS2_MAXQUOTAS; qtype++) dqput(transfer_to[qtype]); if (!status && attr->ia_valid & ATTR_MODE) { - status = posix_acl_chmod(inode, inode->i_mode); + status = ocfs2_acl_chmod(inode, bh); if (status < 0) mlog_errno(status); } if (inode_locked) ocfs2_inode_unlock(inode, 1); + brelse(bh); return status; } diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 6b3e87189a64..a8f1225e6d9b 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -259,7 +259,6 @@ static int ocfs2_mknod(struct inode *dir, struct ocfs2_dir_lookup_result lookup = { NULL, }; sigset_t oldset; int did_block_signals = 0; - struct posix_acl *default_acl = NULL, *acl = NULL; struct ocfs2_dentry_lock *dl = NULL; trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name, @@ -367,12 +366,6 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } - status = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); - if (status) { - mlog_errno(status); - goto leave; - } - handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb, S_ISDIR(mode), xattr_credits)); @@ -421,16 +414,8 @@ static int ocfs2_mknod(struct inode *dir, inc_nlink(dir); } - if (default_acl) { - status = ocfs2_set_acl(handle, inode, new_fe_bh, - ACL_TYPE_DEFAULT, default_acl, - meta_ac, data_ac); - } - if (!status && acl) { - status = ocfs2_set_acl(handle, inode, new_fe_bh, - ACL_TYPE_ACCESS, acl, - meta_ac, data_ac); - } + status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh, + meta_ac, data_ac); if (status < 0) { mlog_errno(status); @@ -472,10 +457,6 @@ static int ocfs2_mknod(struct inode *dir, d_instantiate(dentry, inode); status = 0; leave: - if (default_acl) - posix_acl_release(default_acl); - if (acl) - posix_acl_release(acl); if (status < 0 && did_quota_inode) dquot_free_inode(inode); if (handle) diff 
--git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 744d5d90c363..92bbe93bfe10 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4248,20 +4248,12 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, struct inode *inode = d_inode(old_dentry); struct buffer_head *old_bh = NULL; struct inode *new_orphan_inode = NULL; - struct posix_acl *default_acl, *acl; - umode_t mode; if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) return -EOPNOTSUPP; - mode = inode->i_mode; - error = posix_acl_create(dir, &mode, &default_acl, &acl); - if (error) { - mlog_errno(error); - return error; - } - error = ocfs2_create_inode_in_orphan(dir, mode, + error = ocfs2_create_inode_in_orphan(dir, inode->i_mode, &new_orphan_inode); if (error) { mlog_errno(error); @@ -4300,16 +4292,11 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, /* If the security isn't preserved, we need to re-initialize them. */ if (!preserve) { error = ocfs2_init_security_and_acl(dir, new_orphan_inode, - &new_dentry->d_name, - default_acl, acl); + &new_dentry->d_name); if (error) mlog_errno(error); } out: - if (default_acl) - posix_acl_release(default_acl); - if (acl) - posix_acl_release(acl); if (!error) { error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode, new_dentry); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 7d3d979f57d9..f19b7381a998 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7216,12 +7216,10 @@ out: */ int ocfs2_init_security_and_acl(struct inode *dir, struct inode *inode, - const struct qstr *qstr, - struct posix_acl *default_acl, - struct posix_acl *acl) + const struct qstr *qstr) { - struct buffer_head *dir_bh = NULL; int ret = 0; + struct buffer_head *dir_bh = NULL; ret = ocfs2_init_security_get(inode, dir, qstr, NULL); if (ret) { @@ -7234,11 +7232,9 @@ int ocfs2_init_security_and_acl(struct inode *dir, mlog_errno(ret); goto leave; } - - if (!ret && default_acl) - ret = ocfs2_iop_set_acl(inode, 
default_acl, ACL_TYPE_DEFAULT); - if (!ret && acl) - ret = ocfs2_iop_set_acl(inode, acl, ACL_TYPE_ACCESS); + ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL); + if (ret) + mlog_errno(ret); ocfs2_inode_unlock(dir, 0); brelse(dir_bh); diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index f10d5b93c366..1633cc15ea1f 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -94,7 +94,5 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, bool preserve_security); int ocfs2_init_security_and_acl(struct inode *dir, struct inode *inode, - const struct qstr *qstr, - struct posix_acl *default_acl, - struct posix_acl *acl); + const struct qstr *qstr); #endif /* OCFS2_XATTR_H */ diff --git a/fs/open.c b/fs/open.c index 17cb6b1dab75..081d3d6df74b 100644 --- a/fs/open.c +++ b/fs/open.c @@ -840,16 +840,12 @@ EXPORT_SYMBOL(file_path); int vfs_open(const struct path *path, struct file *file, const struct cred *cred) { - struct dentry *dentry = path->dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = vfs_select_inode(path->dentry, file->f_flags); - file->f_path = *path; - if (dentry->d_flags & DCACHE_OP_SELECT_INODE) { - inode = dentry->d_op->d_select_inode(dentry, file->f_flags); - if (IS_ERR(inode)) - return PTR_ERR(inode); - } + if (IS_ERR(inode)) + return PTR_ERR(inode); + file->f_path = *path; return do_dentry_open(file, inode, NULL, cred); } diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 5d972e6cd3fe..791235e03d17 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -411,9 +411,7 @@ static inline struct dentry *ovl_lookup_real(struct dentry *dir, { struct dentry *dentry; - inode_lock(dir->d_inode); - dentry = lookup_one_len(name->name, dir, name->len); - inode_unlock(dir->d_inode); + dentry = lookup_hash(name, dir); if (IS_ERR(dentry)) { if (PTR_ERR(dentry) == -ENOENT) diff --git a/fs/splice.c b/fs/splice.c index b018eb485019..dd9bf7e410d2 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1143,6 +1143,9 @@ static long 
do_splice_to(struct file *in, loff_t *ppos, if (unlikely(ret < 0)) return ret; + if (unlikely(len > MAX_RW_COUNT)) + len = MAX_RW_COUNT; + if (in->f_op->splice_read) splice_read = in->f_op->splice_read; else diff --git a/include/asm-generic/rwsem.h b/include/asm-generic/rwsem.h index d6d5dc98d7da..3fc94a046bf5 100644 --- a/include/asm-generic/rwsem.h +++ b/include/asm-generic/rwsem.h @@ -53,7 +53,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) /* * lock for writing */ -static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) +static inline void __down_write(struct rw_semaphore *sem) { long tmp; @@ -63,9 +63,16 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) rwsem_down_write_failed(sem); } -static inline void __down_write(struct rw_semaphore *sem) +static inline int __down_write_killable(struct rw_semaphore *sem) { - __down_write_nested(sem, 0); + long tmp; + + tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS, + (atomic_long_t *)&sem->count); + if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) + if (IS_ERR(rwsem_down_write_failed_killable(sem))) + return -EINTR; + return 0; } static inline int __down_write_trylock(struct rw_semaphore *sem) diff --git a/include/linux/atomic.h b/include/linux/atomic.h index 506c3531832e..e451534fe54d 100644 --- a/include/linux/atomic.h +++ b/include/linux/atomic.h @@ -560,11 +560,11 @@ static inline int atomic_dec_if_positive(atomic_t *v) /** * atomic_fetch_or - perform *p |= mask and return old value of *p - * @p: pointer to atomic_t * @mask: mask to OR on the atomic_t + * @p: pointer to atomic_t */ #ifndef atomic_fetch_or -static inline int atomic_fetch_or(atomic_t *p, int mask) +static inline int atomic_fetch_or(int mask, atomic_t *p) { int old, val = atomic_read(p); diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 4bb4de8d95ea..7e9422cb5989 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -565,4 +565,16 @@ 
static inline struct dentry *d_real(struct dentry *dentry) return dentry; } +static inline struct inode *vfs_select_inode(struct dentry *dentry, + unsigned open_flags) +{ + struct inode *inode = d_inode(dentry); + + if (inode && unlikely(dentry->d_flags & DCACHE_OP_SELECT_INODE)) + inode = dentry->d_op->d_select_inode(dentry, open_flags); + + return inode; +} + + #endif /* __LINUX_DCACHE_H */ diff --git a/include/linux/efi.h b/include/linux/efi.h index 1626474567ac..df7acb51f3cc 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -21,6 +21,7 @@ #include <linux/pfn.h> #include <linux/pstore.h> #include <linux/reboot.h> +#include <linux/screen_info.h> #include <asm/page.h> @@ -124,6 +125,13 @@ typedef struct { } efi_capsule_header_t; /* + * EFI capsule flags + */ +#define EFI_CAPSULE_PERSIST_ACROSS_RESET 0x00010000 +#define EFI_CAPSULE_POPULATE_SYSTEM_TABLE 0x00020000 +#define EFI_CAPSULE_INITIATE_RESET 0x00040000 + +/* * Allocation types for calls to boottime->allocate_pages. */ #define EFI_ALLOCATE_ANY_PAGES 0 @@ -282,9 +290,10 @@ typedef struct { efi_status_t (*handle_protocol)(efi_handle_t, efi_guid_t *, void **); void *__reserved; void *register_protocol_notify; - void *locate_handle; + efi_status_t (*locate_handle)(int, efi_guid_t *, void *, + unsigned long *, efi_handle_t *); void *locate_device_path; - void *install_configuration_table; + efi_status_t (*install_configuration_table)(efi_guid_t *, void *); void *load_image; void *start_image; void *exit; @@ -623,6 +632,27 @@ void efi_native_runtime_setup(void); EFI_GUID(0x3152bca5, 0xeade, 0x433d, \ 0x86, 0x2e, 0xc0, 0x1c, 0xdc, 0x29, 0x1f, 0x44) +#define EFI_MEMORY_ATTRIBUTES_TABLE_GUID \ + EFI_GUID(0xdcfa911d, 0x26eb, 0x469f, \ + 0xa2, 0x20, 0x38, 0xb7, 0xdc, 0x46, 0x12, 0x20) + +#define EFI_CONSOLE_OUT_DEVICE_GUID \ + EFI_GUID(0xd3b36f2c, 0xd551, 0x11d4, \ + 0x9a, 0x46, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d) + +/* + * This GUID is used to pass to the kernel proper the struct screen_info + * structure 
that was populated by the stub based on the GOP protocol instance + * associated with ConOut + */ +#define LINUX_EFI_ARM_SCREEN_INFO_TABLE_GUID \ + EFI_GUID(0xe03fc20a, 0x85dc, 0x406e, \ + 0xb9, 0xe, 0x4a, 0xb5, 0x02, 0x37, 0x1d, 0x95) + +#define LINUX_EFI_LOADER_ENTRY_GUID \ + EFI_GUID(0x4a67b082, 0x0a4c, 0x41cf, \ + 0xb6, 0xc7, 0x44, 0x0b, 0x29, 0xbb, 0x8c, 0x4f) + typedef struct { efi_guid_t guid; u64 table; @@ -847,6 +877,14 @@ typedef struct { #define EFI_INVALID_TABLE_ADDR (~0UL) +typedef struct { + u32 version; + u32 num_entries; + u32 desc_size; + u32 reserved; + efi_memory_desc_t entry[0]; +} efi_memory_attributes_table_t; + /* * All runtime access to EFI goes through this structure: */ @@ -868,6 +906,7 @@ extern struct efi { unsigned long config_table; /* config tables */ unsigned long esrt; /* ESRT table */ unsigned long properties_table; /* properties table */ + unsigned long mem_attr_table; /* memory attributes table */ efi_get_time_t *get_time; efi_set_time_t *set_time; efi_get_wakeup_time_t *get_wakeup_time; @@ -883,7 +922,7 @@ extern struct efi { efi_get_next_high_mono_count_t *get_next_high_mono_count; efi_reset_system_t *reset_system; efi_set_virtual_address_map_t *set_virtual_address_map; - struct efi_memory_map *memmap; + struct efi_memory_map memmap; unsigned long flags; } efi; @@ -945,7 +984,6 @@ extern void efi_initialize_iomem_resources(struct resource *code_resource, extern void efi_get_time(struct timespec *now); extern void efi_reserve_boot_services(void); extern int efi_get_fdt_params(struct efi_fdt_params *params); -extern struct efi_memory_map memmap; extern struct kobject *efi_kobj; extern int efi_reboot_quirk_mode; @@ -957,12 +995,34 @@ extern void __init efi_fake_memmap(void); static inline void efi_fake_memmap(void) { } #endif +/* + * efi_memattr_perm_setter - arch specific callback function passed into + * efi_memattr_apply_permissions() that updates the + * mapping permissions described by the second + * argument in the page 
tables referred to by the + * first argument. + */ +typedef int (*efi_memattr_perm_setter)(struct mm_struct *, efi_memory_desc_t *); + +extern int efi_memattr_init(void); +extern int efi_memattr_apply_permissions(struct mm_struct *mm, + efi_memattr_perm_setter fn); + /* Iterate through an efi_memory_map */ -#define for_each_efi_memory_desc(m, md) \ +#define for_each_efi_memory_desc_in_map(m, md) \ for ((md) = (m)->map; \ (md) <= (efi_memory_desc_t *)((m)->map_end - (m)->desc_size); \ (md) = (void *)(md) + (m)->desc_size) +/** + * for_each_efi_memory_desc - iterate over descriptors in efi.memmap + * @md: the efi_memory_desc_t * iterator + * + * Once the loop finishes @md must not be accessed. + */ +#define for_each_efi_memory_desc(md) \ + for_each_efi_memory_desc_in_map(&efi.memmap, md) + /* * Format an EFI memory descriptor's type and attributes to a user-provided * character buffer, as per snprintf(), and return the buffer. @@ -1000,7 +1060,6 @@ extern int __init efi_setup_pcdp_console(char *); * possible, remove EFI-related code altogether. */ #define EFI_BOOT 0 /* Were we booted from EFI? */ -#define EFI_SYSTEM_TABLES 1 /* Can we use EFI system tables? */ #define EFI_CONFIG_TABLES 2 /* Can we use EFI config tables? */ #define EFI_RUNTIME_SERVICES 3 /* Can we use runtime services? */ #define EFI_MEMMAP 4 /* Can we use EFI memory map? 
*/ @@ -1026,8 +1085,16 @@ static inline bool efi_enabled(int feature) } static inline void efi_reboot(enum reboot_mode reboot_mode, const char *__unused) {} + +static inline bool +efi_capsule_pending(int *reset_type) +{ + return false; +} #endif +extern int efi_status_to_err(efi_status_t status); + /* * Variable Attributes */ @@ -1180,6 +1247,80 @@ struct efi_simple_text_output_protocol { void *test_string; }; +#define PIXEL_RGB_RESERVED_8BIT_PER_COLOR 0 +#define PIXEL_BGR_RESERVED_8BIT_PER_COLOR 1 +#define PIXEL_BIT_MASK 2 +#define PIXEL_BLT_ONLY 3 +#define PIXEL_FORMAT_MAX 4 + +struct efi_pixel_bitmask { + u32 red_mask; + u32 green_mask; + u32 blue_mask; + u32 reserved_mask; +}; + +struct efi_graphics_output_mode_info { + u32 version; + u32 horizontal_resolution; + u32 vertical_resolution; + int pixel_format; + struct efi_pixel_bitmask pixel_information; + u32 pixels_per_scan_line; +} __packed; + +struct efi_graphics_output_protocol_mode_32 { + u32 max_mode; + u32 mode; + u32 info; + u32 size_of_info; + u64 frame_buffer_base; + u32 frame_buffer_size; +} __packed; + +struct efi_graphics_output_protocol_mode_64 { + u32 max_mode; + u32 mode; + u64 info; + u64 size_of_info; + u64 frame_buffer_base; + u64 frame_buffer_size; +} __packed; + +struct efi_graphics_output_protocol_mode { + u32 max_mode; + u32 mode; + unsigned long info; + unsigned long size_of_info; + u64 frame_buffer_base; + unsigned long frame_buffer_size; +} __packed; + +struct efi_graphics_output_protocol_32 { + u32 query_mode; + u32 set_mode; + u32 blt; + u32 mode; +}; + +struct efi_graphics_output_protocol_64 { + u64 query_mode; + u64 set_mode; + u64 blt; + u64 mode; +}; + +struct efi_graphics_output_protocol { + unsigned long query_mode; + unsigned long set_mode; + unsigned long blt; + struct efi_graphics_output_protocol_mode *mode; +}; + +typedef efi_status_t (*efi_graphics_output_protocol_query_mode)( + struct efi_graphics_output_protocol *, u32, unsigned long *, + struct 
efi_graphics_output_mode_info **); + extern struct list_head efivar_sysfs_list; static inline void @@ -1195,8 +1336,7 @@ int efivars_unregister(struct efivars *efivars); struct kobject *efivars_kobject(void); int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), - void *data, bool atomic, bool duplicates, - struct list_head *head); + void *data, bool duplicates, struct list_head *head); void efivar_entry_add(struct efivar_entry *entry, struct list_head *head); void efivar_entry_remove(struct efivar_entry *entry); @@ -1242,6 +1382,13 @@ int efivars_sysfs_init(void); #define EFIVARS_DATA_SIZE_MAX 1024 #endif /* CONFIG_EFI_VARS */ +extern bool efi_capsule_pending(int *reset_type); + +extern int efi_capsule_supported(efi_guid_t guid, u32 flags, + size_t size, int *reset); + +extern int efi_capsule_update(efi_capsule_header_t *capsule, + struct page **pages); #ifdef CONFIG_EFI_RUNTIME_MAP int efi_runtime_map_init(struct kobject *); @@ -1319,5 +1466,9 @@ efi_status_t handle_cmdline_files(efi_system_table_t *sys_table_arg, efi_status_t efi_parse_options(char *cmdline); +efi_status_t efi_setup_gop(efi_system_table_t *sys_table_arg, + struct screen_info *si, efi_guid_t *proto, + unsigned long size); + bool efi_runtime_disabled(void); #endif /* _LINUX_EFI_H */ diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index c06c44242f39..30f089ebe0a4 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -152,6 +152,8 @@ struct kernfs_syscall_ops { int (*rmdir)(struct kernfs_node *kn); int (*rename)(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name); + int (*show_path)(struct seq_file *sf, struct kernfs_node *kn, + struct kernfs_root *root); }; struct kernfs_root { diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index d10ef06971b5..f75222ea7f16 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -446,6 +446,18 @@ do { \ lock_acquired(&(_lock)->dep_map, _RET_IP_); \ } while 
(0) +#define LOCK_CONTENDED_RETURN(_lock, try, lock) \ +({ \ + int ____err = 0; \ + if (!try(_lock)) { \ + lock_contended(&(_lock)->dep_map, _RET_IP_); \ + ____err = lock(_lock); \ + } \ + if (!____err) \ + lock_acquired(&(_lock)->dep_map, _RET_IP_); \ + ____err; \ +}) + #else /* CONFIG_LOCK_STAT */ #define lock_contended(lockdep_map, ip) do {} while (0) @@ -454,6 +466,9 @@ do { \ #define LOCK_CONTENDED(_lock, try, lock) \ lock(_lock) +#define LOCK_CONTENDED_RETURN(_lock, try, lock) \ + lock(_lock) + #endif /* CONFIG_LOCK_STAT */ #ifdef CONFIG_LOCKDEP diff --git a/include/linux/mfd/samsung/s2mps11.h b/include/linux/mfd/samsung/s2mps11.h index b288965e8101..2c14eeca46f0 100644 --- a/include/linux/mfd/samsung/s2mps11.h +++ b/include/linux/mfd/samsung/s2mps11.h @@ -173,10 +173,12 @@ enum s2mps11_regulators { #define S2MPS11_LDO_VSEL_MASK 0x3F #define S2MPS11_BUCK_VSEL_MASK 0xFF +#define S2MPS11_BUCK9_VSEL_MASK 0x1F #define S2MPS11_ENABLE_MASK (0x03 << S2MPS11_ENABLE_SHIFT) #define S2MPS11_ENABLE_SHIFT 0x06 #define S2MPS11_LDO_N_VOLTAGES (S2MPS11_LDO_VSEL_MASK + 1) #define S2MPS11_BUCK_N_VOLTAGES (S2MPS11_BUCK_VSEL_MASK + 1) +#define S2MPS11_BUCK9_N_VOLTAGES (S2MPS11_BUCK9_VSEL_MASK + 1) #define S2MPS11_RAMP_DELAY 25000 /* uV/us */ #define S2MPS11_CTRL1_PWRHOLD_MASK BIT(4) diff --git a/include/linux/mm.h b/include/linux/mm.h index 864d7221de84..8f468e0d2534 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -500,11 +500,20 @@ static inline int page_mapcount(struct page *page) #ifdef CONFIG_TRANSPARENT_HUGEPAGE int total_mapcount(struct page *page); +int page_trans_huge_mapcount(struct page *page, int *total_mapcount); #else static inline int total_mapcount(struct page *page) { return page_mapcount(page); } +static inline int page_trans_huge_mapcount(struct page *page, + int *total_mapcount) +{ + int mapcount = page_mapcount(page); + if (total_mapcount) + *total_mapcount = mapcount; + return mapcount; +} #endif static inline struct page *virt_to_head_page(const 
void *x) diff --git a/include/linux/namei.h b/include/linux/namei.h index 77d01700daf7..ec5ec2818a28 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -79,6 +79,8 @@ extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int); extern struct dentry *lookup_one_len(const char *, struct dentry *, int); extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int); +struct qstr; +extern struct dentry *lookup_hash(const struct qstr *, struct dentry *); extern int follow_down_one(struct path *); extern int follow_down(struct path *); diff --git a/include/linux/proportions.h b/include/linux/proportions.h deleted file mode 100644 index 21221338ad18..000000000000 --- a/include/linux/proportions.h +++ /dev/null @@ -1,137 +0,0 @@ -/* - * FLoating proportions - * - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra - * - * This file contains the public data structure and API definitions. - */ - -#ifndef _LINUX_PROPORTIONS_H -#define _LINUX_PROPORTIONS_H - -#include <linux/percpu_counter.h> -#include <linux/spinlock.h> -#include <linux/mutex.h> -#include <linux/gfp.h> - -struct prop_global { - /* - * The period over which we differentiate - * - * period = 2^shift - */ - int shift; - /* - * The total event counter aka 'time'. - * - * Treated as an unsigned long; the lower 'shift - 1' bits are the - * counter bits, the remaining upper bits the period counter. - */ - struct percpu_counter events; -}; - -/* - * global proportion descriptor - * - * this is needed to consistently flip prop_global structures. 
- */ -struct prop_descriptor { - int index; - struct prop_global pg[2]; - struct mutex mutex; /* serialize the prop_global switch */ -}; - -int prop_descriptor_init(struct prop_descriptor *pd, int shift, gfp_t gfp); -void prop_change_shift(struct prop_descriptor *pd, int new_shift); - -/* - * ----- PERCPU ------ - */ - -struct prop_local_percpu { - /* - * the local events counter - */ - struct percpu_counter events; - - /* - * snapshot of the last seen global state - */ - int shift; - unsigned long period; - raw_spinlock_t lock; /* protect the snapshot state */ -}; - -int prop_local_init_percpu(struct prop_local_percpu *pl, gfp_t gfp); -void prop_local_destroy_percpu(struct prop_local_percpu *pl); -void __prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl); -void prop_fraction_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl, - long *numerator, long *denominator); - -static inline -void prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl) -{ - unsigned long flags; - - local_irq_save(flags); - __prop_inc_percpu(pd, pl); - local_irq_restore(flags); -} - -/* - * Limit the time part in order to ensure there are some bits left for the - * cycle counter and fraction multiply. 
- */ -#if BITS_PER_LONG == 32 -#define PROP_MAX_SHIFT (3*BITS_PER_LONG/4) -#else -#define PROP_MAX_SHIFT (BITS_PER_LONG/2) -#endif - -#define PROP_FRAC_SHIFT (BITS_PER_LONG - PROP_MAX_SHIFT - 1) -#define PROP_FRAC_BASE (1UL << PROP_FRAC_SHIFT) - -void __prop_inc_percpu_max(struct prop_descriptor *pd, - struct prop_local_percpu *pl, long frac); - - -/* - * ----- SINGLE ------ - */ - -struct prop_local_single { - /* - * the local events counter - */ - unsigned long events; - - /* - * snapshot of the last seen global state - * and a lock protecting this state - */ - unsigned long period; - int shift; - raw_spinlock_t lock; /* protect the snapshot state */ -}; - -#define INIT_PROP_LOCAL_SINGLE(name) \ -{ .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ -} - -int prop_local_init_single(struct prop_local_single *pl); -void prop_local_destroy_single(struct prop_local_single *pl); -void __prop_inc_single(struct prop_descriptor *pd, struct prop_local_single *pl); -void prop_fraction_single(struct prop_descriptor *pd, struct prop_local_single *pl, - long *numerator, long *denominator); - -static inline -void prop_inc_single(struct prop_descriptor *pd, struct prop_local_single *pl) -{ - unsigned long flags; - - local_irq_save(flags); - __prop_inc_single(pd, pl); - local_irq_restore(flags); -} - -#endif /* _LINUX_PROPORTIONS_H */ diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 2657aff2725b..5f1533e3d032 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -508,14 +508,7 @@ int rcu_read_lock_bh_held(void); * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side * critical section unless it can prove otherwise. 
*/ -#ifdef CONFIG_PREEMPT_COUNT int rcu_read_lock_sched_held(void); -#else /* #ifdef CONFIG_PREEMPT_COUNT */ -static inline int rcu_read_lock_sched_held(void) -{ - return 1; -} -#endif /* #else #ifdef CONFIG_PREEMPT_COUNT */ #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ @@ -532,18 +525,10 @@ static inline int rcu_read_lock_bh_held(void) return 1; } -#ifdef CONFIG_PREEMPT_COUNT static inline int rcu_read_lock_sched_held(void) { - return preempt_count() != 0 || irqs_disabled(); -} -#else /* #ifdef CONFIG_PREEMPT_COUNT */ -static inline int rcu_read_lock_sched_held(void) -{ - return 1; + return !preemptible(); } -#endif /* #else #ifdef CONFIG_PREEMPT_COUNT */ - #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ #ifdef CONFIG_PROVE_RCU @@ -1144,4 +1129,17 @@ static inline void rcu_sysidle_force_exit(void) #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ +/* + * Dump the ftrace buffer, but only one time per callsite per boot. + */ +#define rcu_ftrace_dump(oops_dump_mode) \ +do { \ + static atomic_t ___rfd_beenhere = ATOMIC_INIT(0); \ + \ + if (!atomic_read(&___rfd_beenhere) && \ + !atomic_xchg(&___rfd_beenhere, 1)) \ + ftrace_dump(oops_dump_mode); \ +} while (0) + + #endif /* __LINUX_RCUPDATE_H */ diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 64809aea661c..93aea75029fb 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -149,6 +149,22 @@ static inline unsigned long rcu_batches_completed_sched(void) return 0; } +/* + * Return the number of expedited grace periods completed. + */ +static inline unsigned long rcu_exp_batches_completed(void) +{ + return 0; +} + +/* + * Return the number of expedited sched grace periods completed. 
+ */ +static inline unsigned long rcu_exp_batches_completed_sched(void) +{ + return 0; +} + static inline void rcu_force_quiescent_state(void) { } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index ad1eda9fa4da..5043cb823fb2 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -87,6 +87,8 @@ unsigned long rcu_batches_started_sched(void); unsigned long rcu_batches_completed(void); unsigned long rcu_batches_completed_bh(void); unsigned long rcu_batches_completed_sched(void); +unsigned long rcu_exp_batches_completed(void); +unsigned long rcu_exp_batches_completed_sched(void); void show_rcu_gp_kthreads(void); void rcu_force_quiescent_state(void); diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h index 561e8615528d..ae0528b834cd 100644 --- a/include/linux/rwsem-spinlock.h +++ b/include/linux/rwsem-spinlock.h @@ -34,7 +34,7 @@ struct rw_semaphore { extern void __down_read(struct rw_semaphore *sem); extern int __down_read_trylock(struct rw_semaphore *sem); extern void __down_write(struct rw_semaphore *sem); -extern void __down_write_nested(struct rw_semaphore *sem, int subclass); +extern int __must_check __down_write_killable(struct rw_semaphore *sem); extern int __down_write_trylock(struct rw_semaphore *sem); extern void __up_read(struct rw_semaphore *sem); extern void __up_write(struct rw_semaphore *sem); diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 8f498cdde280..d1c12d160ace 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -14,6 +14,7 @@ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/atomic.h> +#include <linux/err.h> #ifdef CONFIG_RWSEM_SPIN_ON_OWNER #include <linux/osq_lock.h> #endif @@ -43,6 +44,7 @@ struct rw_semaphore { extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_down_write_failed_killable(struct 
rw_semaphore *sem); extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *); extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); @@ -116,6 +118,7 @@ extern int down_read_trylock(struct rw_semaphore *sem); * lock for writing */ extern void down_write(struct rw_semaphore *sem); +extern int __must_check down_write_killable(struct rw_semaphore *sem); /* * trylock for writing -- returns 1 if successful, 0 if contention diff --git a/include/linux/sched.h b/include/linux/sched.h index 52c4847b05e2..e8dfa6f0d843 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -40,7 +40,6 @@ struct sched_param { #include <linux/pid.h> #include <linux/percpu.h> #include <linux/topology.h> -#include <linux/proportions.h> #include <linux/seccomp.h> #include <linux/rcupdate.h> #include <linux/rculist.h> @@ -1596,6 +1595,7 @@ struct task_struct { unsigned long sas_ss_sp; size_t sas_ss_size; + unsigned sas_ss_flags; struct callback_head *task_works; @@ -2575,6 +2575,18 @@ static inline int kill_cad_pid(int sig, int priv) */ static inline int on_sig_stack(unsigned long sp) { + /* + * If the signal stack is SS_AUTODISARM then, by construction, we + * can't be on the signal stack unless user code deliberately set + * SS_AUTODISARM when we were already on it. + * + * This improves reliability: if user state gets corrupted such that + * the stack pointer points very close to the end of the signal stack, + * then this check will enable the signal to be handled anyway. + */ + if (current->sas_ss_flags & SS_AUTODISARM) + return 0; + #ifdef CONFIG_STACK_GROWSUP return sp >= current->sas_ss_sp && sp - current->sas_ss_sp < current->sas_ss_size; @@ -2592,6 +2604,13 @@ static inline int sas_ss_flags(unsigned long sp) return on_sig_stack(sp) ? 
SS_ONSTACK : 0; } +static inline void sas_ss_reset(struct task_struct *p) +{ + p->sas_ss_sp = 0; + p->sas_ss_size = 0; + p->sas_ss_flags = SS_DISABLE; +} + static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig) { if (unlikely((ksig->ka.sa.sa_flags & SA_ONSTACK)) && ! sas_ss_flags(sp)) diff --git a/include/linux/signal.h b/include/linux/signal.h index 92557bbce7e7..3fbe81444d31 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -432,8 +432,10 @@ int __save_altstack(stack_t __user *, unsigned long); stack_t __user *__uss = uss; \ struct task_struct *t = current; \ put_user_ex((void __user *)t->sas_ss_sp, &__uss->ss_sp); \ - put_user_ex(sas_ss_flags(sp), &__uss->ss_flags); \ + put_user_ex(t->sas_ss_flags, &__uss->ss_flags); \ put_user_ex(t->sas_ss_size, &__uss->ss_size); \ + if (t->sas_ss_flags & SS_AUTODISARM) \ + sas_ss_reset(t); \ } while (0); #ifdef CONFIG_PROC_FS diff --git a/include/linux/swap.h b/include/linux/swap.h index 0a4cd4703f40..ad220359f1b0 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -418,7 +418,7 @@ extern sector_t swapdev_block(int, pgoff_t); extern int page_swapcount(struct page *); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); -extern int reuse_swap_page(struct page *); +extern bool reuse_swap_page(struct page *, int *); extern int try_to_free_swap(struct page *); struct backing_dev_info; @@ -513,8 +513,8 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -#define reuse_swap_page(page) \ - (!PageTransCompound(page) && page_mapcount(page) == 1) +#define reuse_swap_page(page, total_mapcount) \ + (page_trans_huge_mapcount(page, total_mapcount) == 1) static inline int try_to_free_swap(struct page *page) { diff --git a/include/linux/uio.h b/include/linux/uio.h index fd9bcfedad42..1b5d1cd796e2 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -87,6 +87,7 @@ size_t copy_from_iter(void *addr, size_t bytes, struct 
iov_iter *i); size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i); size_t iov_iter_zero(size_t bytes, struct iov_iter *); unsigned long iov_iter_alignment(const struct iov_iter *i); +unsigned long iov_iter_gap_alignment(const struct iov_iter *i); void iov_iter_init(struct iov_iter *i, int direction, const struct iovec *iov, unsigned long nr_segs, size_t count); void iov_iter_kvec(struct iov_iter *i, int direction, const struct kvec *kvec, diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index ef72c4aada56..d3e756539d44 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -172,6 +172,77 @@ TRACE_EVENT(rcu_grace_period_init, ); /* + * Tracepoint for expedited grace-period events. Takes a string identifying + * the RCU flavor, the expedited grace-period sequence number, and a string + * identifying the grace-period-related event as follows: + * + * "snap": Captured snapshot of expedited grace period sequence number. + * "start": Started a real expedited grace period. + * "end": Ended a real expedited grace period. + * "endwake": Woke piggybackers up. + * "done": Someone else did the expedited grace period for us. + */ +TRACE_EVENT(rcu_exp_grace_period, + + TP_PROTO(const char *rcuname, unsigned long gpseq, const char *gpevent), + + TP_ARGS(rcuname, gpseq, gpevent), + + TP_STRUCT__entry( + __field(const char *, rcuname) + __field(unsigned long, gpseq) + __field(const char *, gpevent) + ), + + TP_fast_assign( + __entry->rcuname = rcuname; + __entry->gpseq = gpseq; + __entry->gpevent = gpevent; + ), + + TP_printk("%s %lu %s", + __entry->rcuname, __entry->gpseq, __entry->gpevent) +); + +/* + * Tracepoint for expedited grace-period funnel-locking events. Takes a + * string identifying the RCU flavor, an integer identifying the rcu_node + * combining-tree level, another pair of integers identifying the lowest- + * and highest-numbered CPU associated with the current rcu_node structure, + * and a string. 
identifying the grace-period-related event as follows: + * + * "nxtlvl": Advance to next level of rcu_node funnel + * "wait": Wait for someone else to do expedited GP + */ +TRACE_EVENT(rcu_exp_funnel_lock, + + TP_PROTO(const char *rcuname, u8 level, int grplo, int grphi, + const char *gpevent), + + TP_ARGS(rcuname, level, grplo, grphi, gpevent), + + TP_STRUCT__entry( + __field(const char *, rcuname) + __field(u8, level) + __field(int, grplo) + __field(int, grphi) + __field(const char *, gpevent) + ), + + TP_fast_assign( + __entry->rcuname = rcuname; + __entry->level = level; + __entry->grplo = grplo; + __entry->grphi = grphi; + __entry->gpevent = gpevent; + ), + + TP_printk("%s %d %d %d %s", + __entry->rcuname, __entry->level, __entry->grplo, + __entry->grphi, __entry->gpevent) +); + +/* * Tracepoint for RCU no-CBs CPU callback handoffs. This event is intended * to assist debugging of these handoffs. * @@ -704,11 +775,15 @@ TRACE_EVENT(rcu_barrier, #else /* #ifdef CONFIG_RCU_TRACE */ #define trace_rcu_grace_period(rcuname, gpnum, gpevent) do { } while (0) -#define trace_rcu_grace_period_init(rcuname, gpnum, level, grplo, grphi, \ - qsmask) do { } while (0) #define trace_rcu_future_grace_period(rcuname, gpnum, completed, c, \ level, grplo, grphi, event) \ do { } while (0) +#define trace_rcu_grace_period_init(rcuname, gpnum, level, grplo, grphi, \ + qsmask) do { } while (0) +#define trace_rcu_exp_grace_period(rcuname, gqseq, gpevent) \ + do { } while (0) +#define trace_rcu_exp_funnel_lock(rcuname, level, grplo, grphi, gpevent) \ + do { } while (0) #define trace_rcu_nocb_wake(rcuname, cpu, reason) do { } while (0) #define trace_rcu_preempt_task(rcuname, pid, gpnum) do { } while (0) #define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0) diff --git a/include/uapi/linux/if.h b/include/uapi/linux/if.h index f80277569f24..e601c8c3bdc7 100644 --- a/include/uapi/linux/if.h +++ b/include/uapi/linux/if.h @@ -19,14 +19,20 @@ #ifndef _LINUX_IF_H #define 
_LINUX_IF_H +#include <linux/libc-compat.h> /* for compatibility with glibc */ #include <linux/types.h> /* for "__kernel_caddr_t" et al */ #include <linux/socket.h> /* for "struct sockaddr" et al */ #include <linux/compiler.h> /* for "__user" et al */ +#if __UAPI_DEF_IF_IFNAMSIZ #define IFNAMSIZ 16 +#endif /* __UAPI_DEF_IF_IFNAMSIZ */ #define IFALIASZ 256 #include <linux/hdlc/ioctl.h> +/* For glibc compatibility. An empty enum does not compile. */ +#if __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO != 0 && \ + __UAPI_DEF_IF_NET_DEVICE_FLAGS != 0 /** * enum net_device_flags - &struct net_device flags * @@ -68,6 +74,8 @@ * @IFF_ECHO: echo sent packets. Volatile. */ enum net_device_flags { +/* for compatibility with glibc net/if.h */ +#if __UAPI_DEF_IF_NET_DEVICE_FLAGS IFF_UP = 1<<0, /* sysfs */ IFF_BROADCAST = 1<<1, /* volatile */ IFF_DEBUG = 1<<2, /* sysfs */ @@ -84,11 +92,17 @@ enum net_device_flags { IFF_PORTSEL = 1<<13, /* sysfs */ IFF_AUTOMEDIA = 1<<14, /* sysfs */ IFF_DYNAMIC = 1<<15, /* sysfs */ +#endif /* __UAPI_DEF_IF_NET_DEVICE_FLAGS */ +#if __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO IFF_LOWER_UP = 1<<16, /* volatile */ IFF_DORMANT = 1<<17, /* volatile */ IFF_ECHO = 1<<18, /* volatile */ +#endif /* __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO */ }; +#endif /* __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO != 0 && __UAPI_DEF_IF_NET_DEVICE_FLAGS != 0 */ +/* for compatibility with glibc net/if.h */ +#if __UAPI_DEF_IF_NET_DEVICE_FLAGS #define IFF_UP IFF_UP #define IFF_BROADCAST IFF_BROADCAST #define IFF_DEBUG IFF_DEBUG @@ -105,9 +119,13 @@ enum net_device_flags { #define IFF_PORTSEL IFF_PORTSEL #define IFF_AUTOMEDIA IFF_AUTOMEDIA #define IFF_DYNAMIC IFF_DYNAMIC +#endif /* __UAPI_DEF_IF_NET_DEVICE_FLAGS */ + +#if __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO #define IFF_LOWER_UP IFF_LOWER_UP #define IFF_DORMANT IFF_DORMANT #define IFF_ECHO IFF_ECHO +#endif /* __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO */ #define 
IFF_VOLATILE (IFF_LOOPBACK|IFF_POINTOPOINT|IFF_BROADCAST|IFF_ECHO|\ IFF_MASTER|IFF_SLAVE|IFF_RUNNING|IFF_LOWER_UP|IFF_DORMANT) @@ -166,6 +184,8 @@ enum { * being very small might be worth keeping for clean configuration. */ +/* for compatibility with glibc net/if.h */ +#if __UAPI_DEF_IF_IFMAP struct ifmap { unsigned long mem_start; unsigned long mem_end; @@ -175,6 +195,7 @@ struct ifmap { unsigned char port; /* 3 bytes spare */ }; +#endif /* __UAPI_DEF_IF_IFMAP */ struct if_settings { unsigned int type; /* Type of physical device or protocol */ @@ -200,6 +221,8 @@ struct if_settings { * remainder may be interface specific. */ +/* for compatibility with glibc net/if.h */ +#if __UAPI_DEF_IF_IFREQ struct ifreq { #define IFHWADDRLEN 6 union @@ -223,6 +246,7 @@ struct ifreq { struct if_settings ifru_settings; } ifr_ifru; }; +#endif /* __UAPI_DEF_IF_IFREQ */ #define ifr_name ifr_ifrn.ifrn_name /* interface name */ #define ifr_hwaddr ifr_ifru.ifru_hwaddr /* MAC address */ @@ -249,6 +273,8 @@ struct ifreq { * must know all networks accessible). */ +/* for compatibility with glibc net/if.h */ +#if __UAPI_DEF_IF_IFCONF struct ifconf { int ifc_len; /* size of buffer */ union { @@ -256,6 +282,8 @@ struct ifconf { struct ifreq __user *ifcu_req; } ifc_ifcu; }; +#endif /* __UAPI_DEF_IF_IFCONF */ + #define ifc_buf ifc_ifcu.ifcu_buf /* buffer address */ #define ifc_req ifc_ifcu.ifcu_req /* array of structures */ diff --git a/include/uapi/linux/libc-compat.h b/include/uapi/linux/libc-compat.h index 7d024ceb075d..d5e38c73377c 100644 --- a/include/uapi/linux/libc-compat.h +++ b/include/uapi/linux/libc-compat.h @@ -51,6 +51,40 @@ /* We have included glibc headers... */ #if defined(__GLIBC__) +/* Coordinate with glibc net/if.h header. */ +#if defined(_NET_IF_H) + +/* GLIBC headers included first so don't define anything + * that would already be defined. 
*/ + +#define __UAPI_DEF_IF_IFCONF 0 +#define __UAPI_DEF_IF_IFMAP 0 +#define __UAPI_DEF_IF_IFNAMSIZ 0 +#define __UAPI_DEF_IF_IFREQ 0 +/* Everything up to IFF_DYNAMIC, matches net/if.h until glibc 2.23 */ +#define __UAPI_DEF_IF_NET_DEVICE_FLAGS 0 +/* For the future if glibc adds IFF_LOWER_UP, IFF_DORMANT and IFF_ECHO */ +#ifndef __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO +#define __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO 1 +#endif /* __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO */ + +#else /* _NET_IF_H */ + +/* Linux headers included first, and we must define everything + * we need. The expectation is that glibc will check the + * __UAPI_DEF_* defines and adjust appropriately. */ + +#define __UAPI_DEF_IF_IFCONF 1 +#define __UAPI_DEF_IF_IFMAP 1 +#define __UAPI_DEF_IF_IFNAMSIZ 1 +#define __UAPI_DEF_IF_IFREQ 1 +/* Everything up to IFF_DYNAMIC, matches net/if.h until glibc 2.23 */ +#define __UAPI_DEF_IF_NET_DEVICE_FLAGS 1 +/* For the future if glibc adds IFF_LOWER_UP, IFF_DORMANT and IFF_ECHO */ +#define __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO 1 + +#endif /* _NET_IF_H */ + /* Coordinate with glibc netinet/in.h header. */ #if defined(_NETINET_IN_H) @@ -117,6 +151,16 @@ * that we need. 
*/ #else /* !defined(__GLIBC__) */ +/* Definitions for if.h */ +#define __UAPI_DEF_IF_IFCONF 1 +#define __UAPI_DEF_IF_IFMAP 1 +#define __UAPI_DEF_IF_IFNAMSIZ 1 +#define __UAPI_DEF_IF_IFREQ 1 +/* Everything up to IFF_DYNAMIC, matches net/if.h until glibc 2.23 */ +#define __UAPI_DEF_IF_NET_DEVICE_FLAGS 1 +/* For the future if glibc adds IFF_LOWER_UP, IFF_DORMANT and IFF_ECHO */ +#define __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO 1 + /* Definitions for in.h */ #define __UAPI_DEF_IN_ADDR 1 #define __UAPI_DEF_IN_IPPROTO 1 diff --git a/include/uapi/linux/signal.h b/include/uapi/linux/signal.h index e1bd50c29ded..cd0804b6bfa2 100644 --- a/include/uapi/linux/signal.h +++ b/include/uapi/linux/signal.h @@ -7,4 +7,9 @@ #define SS_ONSTACK 1 #define SS_DISABLE 2 +/* bit-flags */ +#define SS_AUTODISARM (1U << 31) /* disable sas during sighandling */ +/* mask for all SS_xxx flags */ +#define SS_FLAG_BITS SS_AUTODISARM + #endif /* _UAPI_LINUX_SIGNAL_H */ diff --git a/include/uapi/linux/tc_act/Kbuild b/include/uapi/linux/tc_act/Kbuild index 242cf0c6e33d..e3969bd939e4 100644 --- a/include/uapi/linux/tc_act/Kbuild +++ b/include/uapi/linux/tc_act/Kbuild @@ -10,3 +10,4 @@ header-y += tc_skbedit.h header-y += tc_vlan.h header-y += tc_bpf.h header-y += tc_connmark.h +header-y += tc_ife.h diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 909a7d31ffd3..86cb5c6e8932 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1215,6 +1215,41 @@ static void cgroup_destroy_root(struct cgroup_root *root) cgroup_free_root(root); } +/* + * look up cgroup associated with current task's cgroup namespace on the + * specified hierarchy + */ +static struct cgroup * +current_cgns_cgroup_from_root(struct cgroup_root *root) +{ + struct cgroup *res = NULL; + struct css_set *cset; + + lockdep_assert_held(&css_set_lock); + + rcu_read_lock(); + + cset = current->nsproxy->cgroup_ns->root_cset; + if (cset == &init_css_set) { + res = &root->cgrp; + } else { + struct cgrp_cset_link *link; + + 
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { + struct cgroup *c = link->cgrp; + + if (c->root == root) { + res = c; + break; + } + } + } + rcu_read_unlock(); + + BUG_ON(!res); + return res; +} + /* look up cgroup associated with given css_set on the specified hierarchy */ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, struct cgroup_root *root) @@ -1593,6 +1628,33 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) return 0; } +static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, + struct kernfs_root *kf_root) +{ + int len = 0; + char *buf = NULL; + struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root); + struct cgroup *ns_cgroup; + + buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + spin_lock_bh(&css_set_lock); + ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot); + len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX); + spin_unlock_bh(&css_set_lock); + + if (len >= PATH_MAX) + len = -ERANGE; + else if (len > 0) { + seq_escape(sf, buf, " \t\n\\"); + len = 0; + } + kfree(buf); + return len; +} + static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root) { @@ -5433,6 +5495,7 @@ static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .rename = cgroup_rename, + .show_path = cgroup_show_path, }; static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) diff --git a/kernel/fork.c b/kernel/fork.c index d277e83ed3e0..3e8451527cbe 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1494,7 +1494,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, * sigaltstack should be cleared when sharing the same VM */ if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM) - p->sas_ss_sp = p->sas_ss_size = 0; + sas_ss_reset(p); /* * Syscall tracing and stepping should be turned off in the diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 
78c1c0ee6dc1..874d53eaf389 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -708,7 +708,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) * yet. Otherwise we look it up. We cache the result in the lock object * itself, so actual lookup of the hash should be once per lock object. */ -static inline struct lock_class * +static struct lock_class * register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) { struct lockdep_subclass_key *key; diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 8ef1919d63b2..f8c5af52a131 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -75,12 +75,7 @@ struct lock_stress_stats { long n_lock_acquired; }; -#if defined(MODULE) -#define LOCKTORTURE_RUNNABLE_INIT 1 -#else -#define LOCKTORTURE_RUNNABLE_INIT 0 -#endif -int torture_runnable = LOCKTORTURE_RUNNABLE_INIT; +int torture_runnable = IS_ENABLED(MODULE); module_param(torture_runnable, int, 0444); MODULE_PARM_DESC(torture_runnable, "Start locktorture at module init"); @@ -394,12 +389,12 @@ static void torture_rtmutex_boost(struct torture_random_state *trsp) if (!rt_task(current)) { /* - * (1) Boost priority once every ~50k operations. When the + * Boost priority once every ~50k operations. When the * task tries to take the lock, the rtmutex it will account * for the new priority, and do any corresponding pi-dance. */ - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * factor))) { + if (trsp && !(torture_random(trsp) % + (cxt.nrealwriters_stress * factor))) { policy = SCHED_FIFO; param.sched_priority = MAX_RT_PRIO - 1; } else /* common case, do nothing */ @@ -748,6 +743,15 @@ static void lock_torture_cleanup(void) if (torture_cleanup_begin()) return; + /* + * Indicates early cleanup, meaning that the test has not run, + * such as when passing bogus args when loading the module. 
As + * such, only perform the underlying torture-specific cleanups, + * and avoid anything related to locktorture. + */ + if (!cxt.lwsa) + goto end; + if (writer_tasks) { for (i = 0; i < cxt.nrealwriters_stress; i++) torture_stop_kthread(lock_torture_writer, @@ -776,6 +780,7 @@ static void lock_torture_cleanup(void) else lock_torture_print_module_parms(cxt.cur_ops, "End of test: SUCCESS"); +end: torture_cleanup_end(); } @@ -870,6 +875,7 @@ static int __init lock_torture_init(void) VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory"); firsterr = -ENOMEM; kfree(cxt.lwsa); + cxt.lwsa = NULL; goto unwind; } @@ -878,6 +884,7 @@ static int __init lock_torture_init(void) cxt.lrsa[i].n_lock_acquired = 0; } } + lock_torture_print_module_parms(cxt.cur_ops, "Start of test"); /* Prepare torture context. */ diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index d734b7502001..22e025309845 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -191,8 +191,6 @@ static ssize_t qstat_write(struct file *file, const char __user *user_buf, for (i = 0 ; i < qstat_num; i++) WRITE_ONCE(ptr[i], 0); - for (i = 0 ; i < qstat_num; i++) - WRITE_ONCE(ptr[i], 0); } return count; } @@ -214,10 +212,8 @@ static int __init init_qspinlock_stat(void) struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL); int i; - if (!d_qstat) { - pr_warn("Could not create 'qlockstat' debugfs directory\n"); - return 0; - } + if (!d_qstat) + goto out; /* * Create the debugfs files @@ -227,12 +223,20 @@ static int __init init_qspinlock_stat(void) * performance. 
*/ for (i = 0; i < qstat_num; i++) - debugfs_create_file(qstat_names[i], 0400, d_qstat, - (void *)(long)i, &fops_qstat); + if (!debugfs_create_file(qstat_names[i], 0400, d_qstat, + (void *)(long)i, &fops_qstat)) + goto fail_undo; + + if (!debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat, + (void *)(long)qstat_reset_cnts, &fops_qstat)) + goto fail_undo; - debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat, - (void *)(long)qstat_reset_cnts, &fops_qstat); return 0; +fail_undo: + debugfs_remove_recursive(d_qstat); +out: + pr_warn("Could not create 'qlockstat' debugfs entries\n"); + return -ENOMEM; } fs_initcall(init_qspinlock_stat); diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 3a5048572065..1591f6b3539f 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -191,11 +191,12 @@ int __down_read_trylock(struct rw_semaphore *sem) /* * get a write lock on the semaphore */ -void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) +int __sched __down_write_common(struct rw_semaphore *sem, int state) { struct rwsem_waiter waiter; struct task_struct *tsk; unsigned long flags; + int ret = 0; raw_spin_lock_irqsave(&sem->wait_lock, flags); @@ -215,21 +216,33 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) */ if (sem->count == 0) break; - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (signal_pending_state(state, current)) { + ret = -EINTR; + goto out; + } + set_task_state(tsk, state); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); schedule(); raw_spin_lock_irqsave(&sem->wait_lock, flags); } /* got the lock */ sem->count = -1; +out: list_del(&waiter.list); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + + return ret; } void __sched __down_write(struct rw_semaphore *sem) { - __down_write_nested(sem, 0); + __down_write_common(sem, TASK_UNINTERRUPTIBLE); +} + +int __sched __down_write_killable(struct rw_semaphore *sem) +{ + return 
__down_write_common(sem, TASK_KILLABLE); } /* diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index a4d4de05b2d1..09e30c6225e5 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -433,12 +433,13 @@ static inline bool rwsem_has_spinner(struct rw_semaphore *sem) /* * Wait until we successfully acquire the write lock */ -__visible -struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) +static inline struct rw_semaphore * +__rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) { long count; bool waiting = true; /* any queued threads before us */ struct rwsem_waiter waiter; + struct rw_semaphore *ret = sem; /* undo write bias from down_write operation, stop active locking */ count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem); @@ -478,7 +479,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); /* wait until we successfully acquire the lock */ - set_current_state(TASK_UNINTERRUPTIBLE); + set_current_state(state); while (true) { if (rwsem_try_write_lock(count, sem)) break; @@ -486,21 +487,48 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) /* Block until there are no active lockers. 
*/ do { + if (signal_pending_state(state, current)) + goto out_nolock; + schedule(); - set_current_state(TASK_UNINTERRUPTIBLE); + set_current_state(state); } while ((count = sem->count) & RWSEM_ACTIVE_MASK); raw_spin_lock_irq(&sem->wait_lock); } __set_current_state(TASK_RUNNING); + list_del(&waiter.list); + raw_spin_unlock_irq(&sem->wait_lock); + return ret; + +out_nolock: + __set_current_state(TASK_RUNNING); + raw_spin_lock_irq(&sem->wait_lock); list_del(&waiter.list); + if (list_empty(&sem->wait_list)) + rwsem_atomic_update(-RWSEM_WAITING_BIAS, sem); + else + __rwsem_do_wake(sem, RWSEM_WAKE_ANY); raw_spin_unlock_irq(&sem->wait_lock); - return sem; + return ERR_PTR(-EINTR); +} + +__visible struct rw_semaphore * __sched +rwsem_down_write_failed(struct rw_semaphore *sem) +{ + return __rwsem_down_write_failed_common(sem, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(rwsem_down_write_failed); +__visible struct rw_semaphore * __sched +rwsem_down_write_failed_killable(struct rw_semaphore *sem) +{ + return __rwsem_down_write_failed_common(sem, TASK_KILLABLE); +} +EXPORT_SYMBOL(rwsem_down_write_failed_killable); + /* * handle waking up a waiter on the semaphore * - up_read/up_write has decremented the active part of count if we come here diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 205be0ce34de..c817216c1615 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -55,6 +55,25 @@ void __sched down_write(struct rw_semaphore *sem) EXPORT_SYMBOL(down_write); /* + * lock for writing + */ +int __sched down_write_killable(struct rw_semaphore *sem) +{ + might_sleep(); + rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); + + if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) { + rwsem_release(&sem->dep_map, 1, _RET_IP_); + return -EINTR; + } + + rwsem_set_owner(sem); + return 0; +} + +EXPORT_SYMBOL(down_write_killable); + +/* * trylock for writing -- returns 1 if successful, 0 if contention */ int down_write_trylock(struct 
rw_semaphore *sem) diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 032b2c015beb..18dfc485225c 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -5,6 +5,7 @@ KCOV_INSTRUMENT := n obj-y += update.o sync.o obj-$(CONFIG_SRCU) += srcu.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o +obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_PREEMPT_RCU) += tree.o obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c new file mode 100644 index 000000000000..3cee0d8393ed --- /dev/null +++ b/kernel/rcu/rcuperf.c @@ -0,0 +1,655 @@ +/* + * Read-Copy Update module-based performance-test facility + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2015 + * + * Authors: Paul E. 
McKenney <paulmck@us.ibm.com> + */ +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/err.h> +#include <linux/spinlock.h> +#include <linux/smp.h> +#include <linux/rcupdate.h> +#include <linux/interrupt.h> +#include <linux/sched.h> +#include <linux/atomic.h> +#include <linux/bitops.h> +#include <linux/completion.h> +#include <linux/moduleparam.h> +#include <linux/percpu.h> +#include <linux/notifier.h> +#include <linux/reboot.h> +#include <linux/freezer.h> +#include <linux/cpu.h> +#include <linux/delay.h> +#include <linux/stat.h> +#include <linux/srcu.h> +#include <linux/slab.h> +#include <asm/byteorder.h> +#include <linux/torture.h> +#include <linux/vmalloc.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); + +#define PERF_FLAG "-perf:" +#define PERFOUT_STRING(s) \ + pr_alert("%s" PERF_FLAG s "\n", perf_type) +#define VERBOSE_PERFOUT_STRING(s) \ + do { if (verbose) pr_alert("%s" PERF_FLAG " %s\n", perf_type, s); } while (0) +#define VERBOSE_PERFOUT_ERRSTRING(s) \ + do { if (verbose) pr_alert("%s" PERF_FLAG "!!! 
%s\n", perf_type, s); } while (0) + +torture_param(bool, gp_exp, true, "Use expedited GP wait primitives"); +torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); +torture_param(int, nreaders, -1, "Number of RCU reader threads"); +torture_param(int, nwriters, -1, "Number of RCU updater threads"); +torture_param(bool, shutdown, false, "Shutdown at end of performance tests."); +torture_param(bool, verbose, true, "Enable verbose debugging printk()s"); + +static char *perf_type = "rcu"; +module_param(perf_type, charp, 0444); +MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, rcu_bh, ...)"); + +static int nrealreaders; +static int nrealwriters; +static struct task_struct **writer_tasks; +static struct task_struct **reader_tasks; +static struct task_struct *shutdown_task; + +static u64 **writer_durations; +static int *writer_n_durations; +static atomic_t n_rcu_perf_reader_started; +static atomic_t n_rcu_perf_writer_started; +static atomic_t n_rcu_perf_writer_finished; +static wait_queue_head_t shutdown_wq; +static u64 t_rcu_perf_writer_started; +static u64 t_rcu_perf_writer_finished; +static unsigned long b_rcu_perf_writer_started; +static unsigned long b_rcu_perf_writer_finished; + +static int rcu_perf_writer_state; +#define RTWS_INIT 0 +#define RTWS_EXP_SYNC 1 +#define RTWS_SYNC 2 +#define RTWS_IDLE 2 +#define RTWS_STOPPING 3 + +#define MAX_MEAS 10000 +#define MIN_MEAS 100 + +#if defined(MODULE) || defined(CONFIG_RCU_PERF_TEST_RUNNABLE) +#define RCUPERF_RUNNABLE_INIT 1 +#else +#define RCUPERF_RUNNABLE_INIT 0 +#endif +static int perf_runnable = RCUPERF_RUNNABLE_INIT; +module_param(perf_runnable, int, 0444); +MODULE_PARM_DESC(perf_runnable, "Start rcuperf at boot"); + +/* + * Operations vector for selecting different types of tests. 
+ */ + +struct rcu_perf_ops { + int ptype; + void (*init)(void); + void (*cleanup)(void); + int (*readlock)(void); + void (*readunlock)(int idx); + unsigned long (*started)(void); + unsigned long (*completed)(void); + unsigned long (*exp_completed)(void); + void (*sync)(void); + void (*exp_sync)(void); + const char *name; +}; + +static struct rcu_perf_ops *cur_ops; + +/* + * Definitions for rcu perf testing. + */ + +static int rcu_perf_read_lock(void) __acquires(RCU) +{ + rcu_read_lock(); + return 0; +} + +static void rcu_perf_read_unlock(int idx) __releases(RCU) +{ + rcu_read_unlock(); +} + +static unsigned long __maybe_unused rcu_no_completed(void) +{ + return 0; +} + +static void rcu_sync_perf_init(void) +{ +} + +static struct rcu_perf_ops rcu_ops = { + .ptype = RCU_FLAVOR, + .init = rcu_sync_perf_init, + .readlock = rcu_perf_read_lock, + .readunlock = rcu_perf_read_unlock, + .started = rcu_batches_started, + .completed = rcu_batches_completed, + .exp_completed = rcu_exp_batches_completed, + .sync = synchronize_rcu, + .exp_sync = synchronize_rcu_expedited, + .name = "rcu" +}; + +/* + * Definitions for rcu_bh perf testing. + */ + +static int rcu_bh_perf_read_lock(void) __acquires(RCU_BH) +{ + rcu_read_lock_bh(); + return 0; +} + +static void rcu_bh_perf_read_unlock(int idx) __releases(RCU_BH) +{ + rcu_read_unlock_bh(); +} + +static struct rcu_perf_ops rcu_bh_ops = { + .ptype = RCU_BH_FLAVOR, + .init = rcu_sync_perf_init, + .readlock = rcu_bh_perf_read_lock, + .readunlock = rcu_bh_perf_read_unlock, + .started = rcu_batches_started_bh, + .completed = rcu_batches_completed_bh, + .exp_completed = rcu_exp_batches_completed_sched, + .sync = synchronize_rcu_bh, + .exp_sync = synchronize_rcu_bh_expedited, + .name = "rcu_bh" +}; + +/* + * Definitions for srcu perf testing. 
+ */ + +DEFINE_STATIC_SRCU(srcu_ctl_perf); +static struct srcu_struct *srcu_ctlp = &srcu_ctl_perf; + +static int srcu_perf_read_lock(void) __acquires(srcu_ctlp) +{ + return srcu_read_lock(srcu_ctlp); +} + +static void srcu_perf_read_unlock(int idx) __releases(srcu_ctlp) +{ + srcu_read_unlock(srcu_ctlp, idx); +} + +static unsigned long srcu_perf_completed(void) +{ + return srcu_batches_completed(srcu_ctlp); +} + +static void srcu_perf_synchronize(void) +{ + synchronize_srcu(srcu_ctlp); +} + +static void srcu_perf_synchronize_expedited(void) +{ + synchronize_srcu_expedited(srcu_ctlp); +} + +static struct rcu_perf_ops srcu_ops = { + .ptype = SRCU_FLAVOR, + .init = rcu_sync_perf_init, + .readlock = srcu_perf_read_lock, + .readunlock = srcu_perf_read_unlock, + .started = NULL, + .completed = srcu_perf_completed, + .exp_completed = srcu_perf_completed, + .sync = srcu_perf_synchronize, + .exp_sync = srcu_perf_synchronize_expedited, + .name = "srcu" +}; + +/* + * Definitions for sched perf testing. + */ + +static int sched_perf_read_lock(void) +{ + preempt_disable(); + return 0; +} + +static void sched_perf_read_unlock(int idx) +{ + preempt_enable(); +} + +static struct rcu_perf_ops sched_ops = { + .ptype = RCU_SCHED_FLAVOR, + .init = rcu_sync_perf_init, + .readlock = sched_perf_read_lock, + .readunlock = sched_perf_read_unlock, + .started = rcu_batches_started_sched, + .completed = rcu_batches_completed_sched, + .exp_completed = rcu_exp_batches_completed_sched, + .sync = synchronize_sched, + .exp_sync = synchronize_sched_expedited, + .name = "sched" +}; + +#ifdef CONFIG_TASKS_RCU + +/* + * Definitions for RCU-tasks perf testing. 
+ */ + +static int tasks_perf_read_lock(void) +{ + return 0; +} + +static void tasks_perf_read_unlock(int idx) +{ +} + +static struct rcu_perf_ops tasks_ops = { + .ptype = RCU_TASKS_FLAVOR, + .init = rcu_sync_perf_init, + .readlock = tasks_perf_read_lock, + .readunlock = tasks_perf_read_unlock, + .started = rcu_no_completed, + .completed = rcu_no_completed, + .sync = synchronize_rcu_tasks, + .exp_sync = synchronize_rcu_tasks, + .name = "tasks" +}; + +#define RCUPERF_TASKS_OPS &tasks_ops, + +static bool __maybe_unused torturing_tasks(void) +{ + return cur_ops == &tasks_ops; +} + +#else /* #ifdef CONFIG_TASKS_RCU */ + +#define RCUPERF_TASKS_OPS + +static bool __maybe_unused torturing_tasks(void) +{ + return false; +} + +#endif /* #else #ifdef CONFIG_TASKS_RCU */ + +/* + * If performance tests complete, wait for shutdown to commence. + */ +static void rcu_perf_wait_shutdown(void) +{ + cond_resched_rcu_qs(); + if (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters) + return; + while (!torture_must_stop()) + schedule_timeout_uninterruptible(1); +} + +/* + * RCU perf reader kthread. Repeatedly does empty RCU read-side + * critical section, minimizing update-side interference. + */ +static int +rcu_perf_reader(void *arg) +{ + unsigned long flags; + int idx; + long me = (long)arg; + + VERBOSE_PERFOUT_STRING("rcu_perf_reader task started"); + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + set_user_nice(current, MAX_NICE); + atomic_inc(&n_rcu_perf_reader_started); + + do { + local_irq_save(flags); + idx = cur_ops->readlock(); + cur_ops->readunlock(idx); + local_irq_restore(flags); + rcu_perf_wait_shutdown(); + } while (!torture_must_stop()); + torture_kthread_stopping("rcu_perf_reader"); + return 0; +} + +/* + * RCU perf writer kthread. Repeatedly does a grace period. 
+ */ +static int +rcu_perf_writer(void *arg) +{ + int i = 0; + int i_max; + long me = (long)arg; + struct sched_param sp; + bool started = false, done = false, alldone = false; + u64 t; + u64 *wdp; + u64 *wdpp = writer_durations[me]; + + VERBOSE_PERFOUT_STRING("rcu_perf_writer task started"); + WARN_ON(rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp); + WARN_ON(rcu_gp_is_normal() && gp_exp); + WARN_ON(!wdpp); + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + sp.sched_priority = 1; + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); + + if (holdoff) + schedule_timeout_uninterruptible(holdoff * HZ); + + t = ktime_get_mono_fast_ns(); + if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) { + t_rcu_perf_writer_started = t; + if (gp_exp) { + b_rcu_perf_writer_started = + cur_ops->exp_completed() / 2; + } else { + b_rcu_perf_writer_started = + cur_ops->completed(); + } + } + + do { + wdp = &wdpp[i]; + *wdp = ktime_get_mono_fast_ns(); + if (gp_exp) { + rcu_perf_writer_state = RTWS_EXP_SYNC; + cur_ops->exp_sync(); + } else { + rcu_perf_writer_state = RTWS_SYNC; + cur_ops->sync(); + } + rcu_perf_writer_state = RTWS_IDLE; + t = ktime_get_mono_fast_ns(); + *wdp = t - *wdp; + i_max = i; + if (!started && + atomic_read(&n_rcu_perf_writer_started) >= nrealwriters) + started = true; + if (!done && i >= MIN_MEAS) { + done = true; + sp.sched_priority = 0; + sched_setscheduler_nocheck(current, + SCHED_NORMAL, &sp); + pr_alert("%s" PERF_FLAG + "rcu_perf_writer %ld has %d measurements\n", + perf_type, me, MIN_MEAS); + if (atomic_inc_return(&n_rcu_perf_writer_finished) >= + nrealwriters) { + schedule_timeout_interruptible(10); + rcu_ftrace_dump(DUMP_ALL); + PERFOUT_STRING("Test complete"); + t_rcu_perf_writer_finished = t; + if (gp_exp) { + b_rcu_perf_writer_finished = + cur_ops->exp_completed() / 2; + } else { + b_rcu_perf_writer_finished = + cur_ops->completed(); + } + if (shutdown) { + smp_mb(); /* Assign before wake. 
*/ + wake_up(&shutdown_wq); + } + } + } + if (done && !alldone && + atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters) + alldone = true; + if (started && !alldone && i < MAX_MEAS - 1) + i++; + rcu_perf_wait_shutdown(); + } while (!torture_must_stop()); + rcu_perf_writer_state = RTWS_STOPPING; + writer_n_durations[me] = i_max; + torture_kthread_stopping("rcu_perf_writer"); + return 0; +} + +static inline void +rcu_perf_print_module_parms(struct rcu_perf_ops *cur_ops, const char *tag) +{ + pr_alert("%s" PERF_FLAG + "--- %s: nreaders=%d nwriters=%d verbose=%d shutdown=%d\n", + perf_type, tag, nrealreaders, nrealwriters, verbose, shutdown); +} + +static void +rcu_perf_cleanup(void) +{ + int i; + int j; + int ngps = 0; + u64 *wdp; + u64 *wdpp; + + if (torture_cleanup_begin()) + return; + + if (reader_tasks) { + for (i = 0; i < nrealreaders; i++) + torture_stop_kthread(rcu_perf_reader, + reader_tasks[i]); + kfree(reader_tasks); + } + + if (writer_tasks) { + for (i = 0; i < nrealwriters; i++) { + torture_stop_kthread(rcu_perf_writer, + writer_tasks[i]); + if (!writer_n_durations) + continue; + j = writer_n_durations[i]; + pr_alert("%s%s writer %d gps: %d\n", + perf_type, PERF_FLAG, i, j); + ngps += j; + } + pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n", + perf_type, PERF_FLAG, + t_rcu_perf_writer_started, t_rcu_perf_writer_finished, + t_rcu_perf_writer_finished - + t_rcu_perf_writer_started, + ngps, + b_rcu_perf_writer_finished - + b_rcu_perf_writer_started); + for (i = 0; i < nrealwriters; i++) { + if (!writer_durations) + break; + if (!writer_n_durations) + continue; + wdpp = writer_durations[i]; + if (!wdpp) + continue; + for (j = 0; j <= writer_n_durations[i]; j++) { + wdp = &wdpp[j]; + pr_alert("%s%s %4d writer-duration: %5d %llu\n", + perf_type, PERF_FLAG, + i, j, *wdp); + if (j % 100 == 0) + schedule_timeout_uninterruptible(1); + } + kfree(writer_durations[i]); + } + kfree(writer_tasks); + kfree(writer_durations); + 
kfree(writer_n_durations); + } + + /* Do flavor-specific cleanup operations. */ + if (cur_ops->cleanup != NULL) + cur_ops->cleanup(); + + torture_cleanup_end(); +} + +/* + * Return the number if non-negative. If -1, the number of CPUs. + * If less than -1, that much less than the number of CPUs, but + * at least one. + */ +static int compute_real(int n) +{ + int nr; + + if (n >= 0) { + nr = n; + } else { + nr = num_online_cpus() + 1 + n; + if (nr <= 0) + nr = 1; + } + return nr; +} + +/* + * RCU perf shutdown kthread. Just waits to be awakened, then shuts + * down system. + */ +static int +rcu_perf_shutdown(void *arg) +{ + do { + wait_event(shutdown_wq, + atomic_read(&n_rcu_perf_writer_finished) >= + nrealwriters); + } while (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters); + smp_mb(); /* Wake before output. */ + rcu_perf_cleanup(); + kernel_power_off(); + return -EINVAL; +} + +static int __init +rcu_perf_init(void) +{ + long i; + int firsterr = 0; + static struct rcu_perf_ops *perf_ops[] = { + &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, + RCUPERF_TASKS_OPS + }; + + if (!torture_init_begin(perf_type, verbose, &perf_runnable)) + return -EBUSY; + + /* Process args and tell the world that the perf'er is on the job. */ + for (i = 0; i < ARRAY_SIZE(perf_ops); i++) { + cur_ops = perf_ops[i]; + if (strcmp(perf_type, cur_ops->name) == 0) + break; + } + if (i == ARRAY_SIZE(perf_ops)) { + pr_alert("rcu-perf: invalid perf type: \"%s\"\n", + perf_type); + pr_alert("rcu-perf types:"); + for (i = 0; i < ARRAY_SIZE(perf_ops); i++) + pr_alert(" %s", perf_ops[i]->name); + pr_alert("\n"); + firsterr = -EINVAL; + goto unwind; + } + if (cur_ops->init) + cur_ops->init(); + + nrealwriters = compute_real(nwriters); + nrealreaders = compute_real(nreaders); + atomic_set(&n_rcu_perf_reader_started, 0); + atomic_set(&n_rcu_perf_writer_started, 0); + atomic_set(&n_rcu_perf_writer_finished, 0); + rcu_perf_print_module_parms(cur_ops, "Start of test"); + + /* Start up the kthreads. 
*/ + + if (shutdown) { + init_waitqueue_head(&shutdown_wq); + firsterr = torture_create_kthread(rcu_perf_shutdown, NULL, + shutdown_task); + if (firsterr) + goto unwind; + schedule_timeout_uninterruptible(1); + } + reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]), + GFP_KERNEL); + if (reader_tasks == NULL) { + VERBOSE_PERFOUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nrealreaders; i++) { + firsterr = torture_create_kthread(rcu_perf_reader, (void *)i, + reader_tasks[i]); + if (firsterr) + goto unwind; + } + while (atomic_read(&n_rcu_perf_reader_started) < nrealreaders) + schedule_timeout_uninterruptible(1); + writer_tasks = kcalloc(nrealwriters, sizeof(reader_tasks[0]), + GFP_KERNEL); + writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations), + GFP_KERNEL); + writer_n_durations = + kcalloc(nrealwriters, sizeof(*writer_n_durations), + GFP_KERNEL); + if (!writer_tasks || !writer_durations || !writer_n_durations) { + VERBOSE_PERFOUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nrealwriters; i++) { + writer_durations[i] = + kcalloc(MAX_MEAS, sizeof(*writer_durations[i]), + GFP_KERNEL); + if (!writer_durations[i]) + goto unwind; + firsterr = torture_create_kthread(rcu_perf_writer, (void *)i, + writer_tasks[i]); + if (firsterr) + goto unwind; + } + torture_init_end(); + return 0; + +unwind: + torture_init_end(); + rcu_perf_cleanup(); + return firsterr; +} + +module_init(rcu_perf_init); +module_exit(rcu_perf_cleanup); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 250ea67c1615..084a28a732eb 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -130,8 +130,8 @@ static struct rcu_torture __rcu *rcu_torture_current; static unsigned long rcu_torture_current_version; static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; static DEFINE_SPINLOCK(rcu_torture_lock); -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], 
rcu_torture_count) = { 0 }; -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = { 0 }; +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count); +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch); static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; static atomic_t n_rcu_torture_alloc; static atomic_t n_rcu_torture_alloc_fail; @@ -916,7 +916,7 @@ rcu_torture_fqs(void *arg) static int rcu_torture_writer(void *arg) { - bool can_expedite = !rcu_gp_is_expedited(); + bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal(); int expediting = 0; unsigned long gp_snap; bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; @@ -932,7 +932,7 @@ rcu_torture_writer(void *arg) VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); if (!can_expedite) { pr_alert("%s" TORTURE_FLAG - " Grace periods expedited from boot/sysfs for %s,\n", + " GP expediting controlled from boot/sysfs for %s,\n", torture_type, cur_ops->name); pr_alert("%s" TORTURE_FLAG " Disabled dynamic grace-period expediting.\n", @@ -1082,17 +1082,6 @@ rcu_torture_fakewriter(void *arg) return 0; } -static void rcutorture_trace_dump(void) -{ - static atomic_t beenhere = ATOMIC_INIT(0); - - if (atomic_read(&beenhere)) - return; - if (atomic_xchg(&beenhere, 1) != 0) - return; - ftrace_dump(DUMP_ALL); -} - /* * RCU torture reader from timer handler. Dereferences rcu_torture_current, * incrementing the corresponding element of the pipeline array. 
The @@ -1142,7 +1131,7 @@ static void rcu_torture_timer(unsigned long unused) if (pipe_count > 1) { do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, started, completed); - rcutorture_trace_dump(); + rcu_ftrace_dump(DUMP_ALL); } __this_cpu_inc(rcu_torture_count[pipe_count]); completed = completed - started; @@ -1215,7 +1204,7 @@ rcu_torture_reader(void *arg) if (pipe_count > 1) { do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, started, completed); - rcutorture_trace_dump(); + rcu_ftrace_dump(DUMP_ALL); } __this_cpu_inc(rcu_torture_count[pipe_count]); completed = completed - started; @@ -1333,7 +1322,7 @@ rcu_torture_stats_print(void) rcu_torture_writer_state, gpnum, completed, flags); show_rcu_gp_kthreads(); - rcutorture_trace_dump(); + rcu_ftrace_dump(DUMP_ALL); } rtcv_snap = rcu_torture_current_version; } @@ -1489,7 +1478,9 @@ static int rcu_torture_barrier_cbs(void *arg) * The above smp_load_acquire() ensures barrier_phase load * is ordered before the folloiwng ->call(). */ + local_irq_disable(); /* Just to test no-irq call_rcu(). 
*/ cur_ops->call(&rcu, rcu_torture_barrier_cbf); + local_irq_enable(); if (atomic_dec_and_test(&barrier_cbs_count)) wake_up(&barrier_wq); } while (!torture_must_stop()); @@ -1596,7 +1587,7 @@ static int rcutorture_cpu_notify(struct notifier_block *self, { long cpu = (long)hcpu; - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_ONLINE: case CPU_DOWN_FAILED: (void)rcutorture_booster_init(cpu); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9a535a86e732..c7f1bc4f817c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -102,6 +102,8 @@ struct rcu_state sname##_state = { \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ + .exp_mutex = __MUTEX_INITIALIZER(sname##_state.exp_mutex), \ + .exp_wake_mutex = __MUTEX_INITIALIZER(sname##_state.exp_wake_mutex), \ } RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); @@ -370,6 +372,21 @@ void rcu_all_qs(void) rcu_momentary_dyntick_idle(); local_irq_restore(flags); } + if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) { + /* + * Yes, we just checked a per-CPU variable with preemption + * enabled, so we might be migrated to some other CPU at + * this point. That is OK because in that case, the + * migration will supply the needed quiescent state. + * We might end up needlessly disabling preemption and + * invoking rcu_sched_qs() on the destination CPU, but + * the probability and cost are both quite low, so this + * should not be a problem in practice. + */ + preempt_disable(); + rcu_sched_qs(); + preempt_enable(); + } this_cpu_inc(rcu_qs_ctr); barrier(); /* Avoid RCU read-side critical sections leaking up. 
*/ } @@ -385,9 +402,11 @@ module_param(qlowmark, long, 0444); static ulong jiffies_till_first_fqs = ULONG_MAX; static ulong jiffies_till_next_fqs = ULONG_MAX; +static bool rcu_kick_kthreads; module_param(jiffies_till_first_fqs, ulong, 0644); module_param(jiffies_till_next_fqs, ulong, 0644); +module_param(rcu_kick_kthreads, bool, 0644); /* * How long the grace period must be before we start recruiting @@ -460,6 +479,28 @@ unsigned long rcu_batches_completed_bh(void) EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); /* + * Return the number of RCU expedited batches completed thus far for + * debug & stats. Odd numbers mean that a batch is in progress, even + * numbers mean idle. The value returned will thus be roughly double + * the cumulative batches since boot. + */ +unsigned long rcu_exp_batches_completed(void) +{ + return rcu_state_p->expedited_sequence; +} +EXPORT_SYMBOL_GPL(rcu_exp_batches_completed); + +/* + * Return the number of RCU-sched expedited batches completed thus far + * for debug & stats. Similar to rcu_exp_batches_completed(). + */ +unsigned long rcu_exp_batches_completed_sched(void) +{ + return rcu_sched_state.expedited_sequence; +} +EXPORT_SYMBOL_GPL(rcu_exp_batches_completed_sched); + +/* * Force a quiescent state. */ void rcu_force_quiescent_state(void) @@ -637,7 +678,7 @@ static void rcu_eqs_enter_common(long long oldval, bool user) idle_task(smp_processor_id()); trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0); - ftrace_dump(DUMP_ORIG); + rcu_ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! 
*/ @@ -799,7 +840,7 @@ static void rcu_eqs_exit_common(long long oldval, int user) trace_rcu_dyntick(TPS("Error on exit: not idle task"), oldval, rdtp->dynticks_nesting); - ftrace_dump(DUMP_ORIG); + rcu_ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! */ @@ -1224,8 +1265,10 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) rsp->gp_flags, gp_state_getname(rsp->gp_state), rsp->gp_state, rsp->gp_kthread ? rsp->gp_kthread->state : ~0); - if (rsp->gp_kthread) + if (rsp->gp_kthread) { sched_show_task(rsp->gp_kthread); + wake_up_process(rsp->gp_kthread); + } } } @@ -1249,6 +1292,25 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) } } +/* + * If too much time has passed in the current grace period, and if + * so configured, go kick the relevant kthreads. + */ +static void rcu_stall_kick_kthreads(struct rcu_state *rsp) +{ + unsigned long j; + + if (!rcu_kick_kthreads) + return; + j = READ_ONCE(rsp->jiffies_kick_kthreads); + if (time_after(jiffies, j) && rsp->gp_kthread) { + WARN_ONCE(1, "Kicking %s grace-period kthread\n", rsp->name); + rcu_ftrace_dump(DUMP_ALL); + wake_up_process(rsp->gp_kthread); + WRITE_ONCE(rsp->jiffies_kick_kthreads, j + HZ); + } +} + static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) { int cpu; @@ -1260,6 +1322,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) struct rcu_node *rnp = rcu_get_root(rsp); long totqlen = 0; + /* Kick and suppress, if so configured. */ + rcu_stall_kick_kthreads(rsp); + if (rcu_cpu_stall_suppress) + return; + /* Only let one CPU complain about others per time interval. */ raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -1333,6 +1400,11 @@ static void print_cpu_stall(struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); long totqlen = 0; + /* Kick and suppress, if so configured. 
*/ + rcu_stall_kick_kthreads(rsp); + if (rcu_cpu_stall_suppress) + return; + /* * OK, time to rat on ourselves... * See Documentation/RCU/stallwarn.txt for info on how to debug @@ -1377,8 +1449,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) unsigned long js; struct rcu_node *rnp; - if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp)) + if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) || + !rcu_gp_in_progress(rsp)) return; + rcu_stall_kick_kthreads(rsp); j = jiffies; /* @@ -2117,8 +2191,11 @@ static int __noreturn rcu_gp_kthread(void *arg) } ret = 0; for (;;) { - if (!ret) + if (!ret) { rsp->jiffies_force_qs = jiffies + j; + WRITE_ONCE(rsp->jiffies_kick_kthreads, + jiffies + 3 * j); + } trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("fqswait")); @@ -2144,6 +2221,15 @@ static int __noreturn rcu_gp_kthread(void *arg) TPS("fqsend")); cond_resched_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); + ret = 0; /* Force full wait till next FQS. */ + j = jiffies_till_next_fqs; + if (j > HZ) { + j = HZ; + jiffies_till_next_fqs = HZ; + } else if (j < 1) { + j = 1; + jiffies_till_next_fqs = 1; + } } else { /* Deal with stray signal. */ cond_resched_rcu_qs(); @@ -2152,14 +2238,12 @@ static int __noreturn rcu_gp_kthread(void *arg) trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("fqswaitsig")); - } - j = jiffies_till_next_fqs; - if (j > HZ) { - j = HZ; - jiffies_till_next_fqs = HZ; - } else if (j < 1) { - j = 1; - jiffies_till_next_fqs = 1; + ret = 1; /* Keep old FQS timing. */ + j = jiffies; + if (time_after(jiffies, rsp->jiffies_force_qs)) + j = 1; + else + j = rsp->jiffies_force_qs - j; } } @@ -3376,8 +3460,12 @@ static void rcu_exp_gp_seq_end(struct rcu_state *rsp) } static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) { + unsigned long s; + smp_mb(); /* Caller's modifications seen first by other CPUs. 
*/ - return rcu_seq_snap(&rsp->expedited_sequence); + s = rcu_seq_snap(&rsp->expedited_sequence); + trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); + return s; } static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) { @@ -3469,7 +3557,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) * for the current expedited grace period. Works only for preemptible * RCU -- other RCU implementation use other means. * - * Caller must hold the root rcu_node's exp_funnel_mutex. + * Caller must hold the rcu_state's exp_mutex. */ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) { @@ -3485,8 +3573,8 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) * recursively up the tree. (Calm down, calm down, we do the recursion * iteratively!) * - * Caller must hold the root rcu_node's exp_funnel_mutex and the - * specified rcu_node structure's ->lock. + * Caller must hold the rcu_state's exp_mutex and the specified rcu_node + * structure's ->lock. */ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake, unsigned long flags) @@ -3523,7 +3611,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, * Report expedited quiescent state for specified node. This is a * lock-acquisition wrapper function for __rcu_report_exp_rnp(). * - * Caller must hold the root rcu_node's exp_funnel_mutex. + * Caller must hold the rcu_state's exp_mutex. */ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake) @@ -3536,8 +3624,8 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, /* * Report expedited quiescent state for multiple CPUs, all covered by the - * specified leaf rcu_node structure. Caller must hold the root - * rcu_node's exp_funnel_mutex. + * specified leaf rcu_node structure. Caller must hold the rcu_state's + * exp_mutex. 
*/ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, unsigned long mask, bool wake) @@ -3555,7 +3643,6 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, /* * Report expedited quiescent state for specified rcu_data (CPU). - * Caller must hold the root rcu_node's exp_funnel_mutex. */ static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, bool wake) @@ -3564,15 +3651,11 @@ static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, } /* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ -static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, - struct rcu_data *rdp, - atomic_long_t *stat, unsigned long s) +static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat, + unsigned long s) { if (rcu_exp_gp_seq_done(rsp, s)) { - if (rnp) - mutex_unlock(&rnp->exp_funnel_mutex); - else if (rdp) - mutex_unlock(&rdp->exp_funnel_mutex); + trace_rcu_exp_grace_period(rsp->name, s, TPS("done")); /* Ensure test happens before caller kfree(). */ smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(stat); @@ -3582,59 +3665,65 @@ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, } /* - * Funnel-lock acquisition for expedited grace periods. Returns a - * pointer to the root rcu_node structure, or NULL if some other - * task did the expedited grace period for us. + * Funnel-lock acquisition for expedited grace periods. Returns true + * if some other task completed an expedited grace period that this task + * can piggy-back on, and with no mutex held. Otherwise, returns false + * with the mutex held, indicating that the caller must actually do the + * expedited grace period. 
*/ -static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) +static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) { struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); - struct rcu_node *rnp0; - struct rcu_node *rnp1 = NULL; + struct rcu_node *rnp = rdp->mynode; + struct rcu_node *rnp_root = rcu_get_root(rsp); + + /* Low-contention fastpath. */ + if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) && + (rnp == rnp_root || + ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) && + !mutex_is_locked(&rsp->exp_mutex) && + mutex_trylock(&rsp->exp_mutex)) + goto fastpath; /* - * First try directly acquiring the root lock in order to reduce - * latency in the common case where expedited grace periods are - * rare. We check mutex_is_locked() to avoid pathological levels of - * memory contention on ->exp_funnel_mutex in the heavy-load case. + * Each pass through the following loop works its way up + * the rcu_node tree, returning if others have done the work or + * otherwise falls through to acquire rsp->exp_mutex. The mapping + * from CPU to rcu_node structure can be inexact, as it is just + * promoting locality and is not strictly needed for correctness. */ - rnp0 = rcu_get_root(rsp); - if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) { - if (mutex_trylock(&rnp0->exp_funnel_mutex)) { - if (sync_exp_work_done(rsp, rnp0, NULL, - &rdp->expedited_workdone0, s)) - return NULL; - return rnp0; + for (; rnp != NULL; rnp = rnp->parent) { + if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s)) + return true; + + /* Work not done, either wait here or go up. */ + spin_lock(&rnp->exp_lock); + if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) { + + /* Someone else doing GP, so wait for them. 
*/ + spin_unlock(&rnp->exp_lock); + trace_rcu_exp_funnel_lock(rsp->name, rnp->level, + rnp->grplo, rnp->grphi, + TPS("wait")); + wait_event(rnp->exp_wq[(s >> 1) & 0x3], + sync_exp_work_done(rsp, + &rdp->exp_workdone2, s)); + return true; } + rnp->exp_seq_rq = s; /* Followers can wait on us. */ + spin_unlock(&rnp->exp_lock); + trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo, + rnp->grphi, TPS("nxtlvl")); } - - /* - * Each pass through the following loop works its way - * up the rcu_node tree, returning if others have done the - * work or otherwise falls through holding the root rnp's - * ->exp_funnel_mutex. The mapping from CPU to rcu_node structure - * can be inexact, as it is just promoting locality and is not - * strictly needed for correctness. - */ - if (sync_exp_work_done(rsp, NULL, NULL, &rdp->expedited_workdone1, s)) - return NULL; - mutex_lock(&rdp->exp_funnel_mutex); - rnp0 = rdp->mynode; - for (; rnp0 != NULL; rnp0 = rnp0->parent) { - if (sync_exp_work_done(rsp, rnp1, rdp, - &rdp->expedited_workdone2, s)) - return NULL; - mutex_lock(&rnp0->exp_funnel_mutex); - if (rnp1) - mutex_unlock(&rnp1->exp_funnel_mutex); - else - mutex_unlock(&rdp->exp_funnel_mutex); - rnp1 = rnp0; + mutex_lock(&rsp->exp_mutex); +fastpath: + if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) { + mutex_unlock(&rsp->exp_mutex); + return true; } - if (sync_exp_work_done(rsp, rnp1, rdp, - &rdp->expedited_workdone3, s)) - return NULL; - return rnp1; + rcu_exp_gp_seq_start(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); + return false; } /* Invoked on each online non-idle CPU for expedited quiescent state. 
*/ @@ -3649,6 +3738,11 @@ static void sync_sched_exp_handler(void *data) if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) return; + if (rcu_is_cpu_rrupt_from_idle()) { + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(&rcu_sched_data), true); + return; + } __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); resched_cpu(smp_processor_id()); } @@ -3773,7 +3867,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) rsp->name); ndetected = 0; rcu_for_each_leaf_node(rsp, rnp) { - ndetected = rcu_print_task_exp_stall(rnp); + ndetected += rcu_print_task_exp_stall(rnp); mask = 1; for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { struct rcu_data *rdp; @@ -3783,7 +3877,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) ndetected++; rdp = per_cpu_ptr(rsp->rda, cpu); pr_cont(" %d-%c%c%c", cpu, - "O."[cpu_online(cpu)], + "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rnp->expmaskinit)], "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); } @@ -3792,7 +3886,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", jiffies - jiffies_start, rsp->expedited_sequence, rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); - if (!ndetected) { + if (ndetected) { pr_err("blocking rcu_node structures:"); rcu_for_each_node_breadth_first(rsp, rnp) { if (rnp == rnp_root) @@ -3818,6 +3912,41 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) } } +/* + * Wait for the current expedited grace period to complete, and then + * wake up everyone who piggybacked on the just-completed expedited + * grace period. Also update all the ->exp_seq_rq counters as needed + * in order to avoid counter-wrap problems. 
+ */ +static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) +{ + struct rcu_node *rnp; + + synchronize_sched_expedited_wait(rsp); + rcu_exp_gp_seq_end(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); + + /* + * Switch over to wakeup mode, allowing the next GP, but -only- the + * next GP, to proceed. + */ + mutex_lock(&rsp->exp_wake_mutex); + mutex_unlock(&rsp->exp_mutex); + + rcu_for_each_node_breadth_first(rsp, rnp) { + if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { + spin_lock(&rnp->exp_lock); + /* Recheck, avoid hang in case someone just arrived. */ + if (ULONG_CMP_LT(rnp->exp_seq_rq, s)) + rnp->exp_seq_rq = s; + spin_unlock(&rnp->exp_lock); + } + wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]); + } + trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); + mutex_unlock(&rsp->exp_wake_mutex); +} + /** * synchronize_sched_expedited - Brute-force RCU-sched grace period * @@ -3837,7 +3966,6 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) void synchronize_sched_expedited(void) { unsigned long s; - struct rcu_node *rnp; struct rcu_state *rsp = &rcu_sched_state; /* If only one CPU, this is automatically a grace period. */ @@ -3852,17 +3980,14 @@ void synchronize_sched_expedited(void) /* Take a snapshot of the sequence number. */ s = rcu_exp_gp_seq_snap(rsp); - - rnp = exp_funnel_lock(rsp, s); - if (rnp == NULL) + if (exp_funnel_lock(rsp, s)) return; /* Someone else did our work for us. */ - rcu_exp_gp_seq_start(rsp); + /* Initialize the rcu_node tree in preparation for the wait. */ sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); - synchronize_sched_expedited_wait(rsp); - rcu_exp_gp_seq_end(rsp); - mutex_unlock(&rnp->exp_funnel_mutex); + /* Wait and clean up, including waking everyone. 
*/ + rcu_exp_wait_wake(rsp, s); } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); @@ -4162,7 +4287,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); rdp->cpu = cpu; rdp->rsp = rsp; - mutex_init(&rdp->exp_funnel_mutex); rcu_boot_init_nocb_percpu_data(rdp); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } @@ -4420,10 +4544,8 @@ static void __init rcu_init_one(struct rcu_state *rsp) { static const char * const buf[] = RCU_NODE_NAME_INIT; static const char * const fqs[] = RCU_FQS_NAME_INIT; - static const char * const exp[] = RCU_EXP_NAME_INIT; static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; - static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; static u8 fl_mask = 0x1; int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ @@ -4482,9 +4604,11 @@ static void __init rcu_init_one(struct rcu_state *rsp) rnp->level = i; INIT_LIST_HEAD(&rnp->blkd_tasks); rcu_init_one_nocb(rnp); - mutex_init(&rnp->exp_funnel_mutex); - lockdep_set_class_and_name(&rnp->exp_funnel_mutex, - &rcu_exp_class[i], exp[i]); + init_waitqueue_head(&rnp->exp_wq[0]); + init_waitqueue_head(&rnp->exp_wq[1]); + init_waitqueue_head(&rnp->exp_wq[2]); + init_waitqueue_head(&rnp->exp_wq[3]); + spin_lock_init(&rnp->exp_lock); } } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index df668c0f9e64..e3959f5e6ddf 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -70,7 +70,6 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } # define RCU_NODE_NAME_INIT { "rcu_node_0" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } -# define RCU_EXP_NAME_INIT { "rcu_node_exp_0" } #elif NR_CPUS <= RCU_FANOUT_2 # define RCU_NUM_LVLS 2 # define NUM_RCU_LVL_0 1 @@ -79,7 +78,6 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } -# define 
RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" } #elif NR_CPUS <= RCU_FANOUT_3 # define RCU_NUM_LVLS 3 # define NUM_RCU_LVL_0 1 @@ -89,7 +87,6 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } -# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" } #elif NR_CPUS <= RCU_FANOUT_4 # define RCU_NUM_LVLS 4 # define NUM_RCU_LVL_0 1 @@ -100,7 +97,6 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } -# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" } #else # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ @@ -252,7 +248,9 @@ struct rcu_node { /* Counts of upcoming no-CB GP requests. */ raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; - struct mutex exp_funnel_mutex ____cacheline_internodealigned_in_smp; + spinlock_t exp_lock ____cacheline_internodealigned_in_smp; + unsigned long exp_seq_rq; + wait_queue_head_t exp_wq[4]; } ____cacheline_internodealigned_in_smp; /* @@ -387,11 +385,9 @@ struct rcu_data { #ifdef CONFIG_RCU_FAST_NO_HZ struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ - struct mutex exp_funnel_mutex; - atomic_long_t expedited_workdone0; /* # done by others #0. */ - atomic_long_t expedited_workdone1; /* # done by others #1. */ - atomic_long_t expedited_workdone2; /* # done by others #2. */ - atomic_long_t expedited_workdone3; /* # done by others #3. */ + atomic_long_t exp_workdone1; /* # done by others #1. */ + atomic_long_t exp_workdone2; /* # done by others #2. 
*/ + atomic_long_t exp_workdone3; /* # done by others #3. */ /* 7) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU @@ -505,6 +501,8 @@ struct rcu_state { /* _rcu_barrier(). */ /* End of fields guarded by barrier_mutex. */ + struct mutex exp_mutex; /* Serialize expedited GP. */ + struct mutex exp_wake_mutex; /* Serialize wakeup. */ unsigned long expedited_sequence; /* Take a ticket. */ atomic_long_t expedited_normal; /* # fallbacks to normal. */ atomic_t expedited_need_qs; /* # CPUs left to check in. */ @@ -513,6 +511,8 @@ struct rcu_state { unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). */ + unsigned long jiffies_kick_kthreads; /* Time at which to kick */ + /* kthreads, if configured. */ unsigned long n_force_qs; /* Number of calls to */ /* force_quiescent_state(). */ unsigned long n_force_qs_lh; /* ~Number of calls leaving */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index efdf7b61ce12..ff1cd4e1188d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -722,18 +722,22 @@ static void sync_rcu_exp_handler(void *info) * synchronize_rcu_expedited - Brute-force RCU grace period * * Wait for an RCU-preempt grace period, but expedite it. The basic - * idea is to invoke synchronize_sched_expedited() to push all the tasks to - * the ->blkd_tasks lists and wait for this list to drain. This consumes - * significant time on all CPUs and is unfriendly to real-time workloads, - * so is thus not recommended for any sort of common-case code. - * In fact, if you are using synchronize_rcu_expedited() in a loop, - * please restructure your code to batch your updates, and then Use a - * single synchronize_rcu() instead. + * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler + * checks whether the CPU is in an RCU-preempt critical section, and + * if so, it sets a flag that causes the outermost rcu_read_unlock() + * to report the quiescent state. 
On the other hand, if the CPU is + * not in an RCU read-side critical section, the IPI handler reports + * the quiescent state immediately. + * + * Although this is a great improvement over previous expedited + * implementations, it is still unfriendly to real-time workloads, so is + * thus not recommended for any sort of common-case code. In fact, if + * you are using synchronize_rcu_expedited() in a loop, please restructure + * your code to batch your updates, and then use a single synchronize_rcu() + * instead. */ void synchronize_rcu_expedited(void) { - struct rcu_node *rnp; - struct rcu_node *rnp_unlock; struct rcu_state *rsp = rcu_state_p; unsigned long s; @@ -744,23 +748,14 @@ void synchronize_rcu_expedited(void) } s = rcu_exp_gp_seq_snap(rsp); - - rnp_unlock = exp_funnel_lock(rsp, s); - if (rnp_unlock == NULL) + if (exp_funnel_lock(rsp, s)) return; /* Someone else did our work for us. */ - rcu_exp_gp_seq_start(rsp); - /* Initialize the rcu_node tree in preparation for the wait. */ sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); - /* Wait for snapshotted ->blkd_tasks lists to drain. */ - rnp = rcu_get_root(rsp); - synchronize_sched_expedited_wait(rsp); - - /* Clean up and exit. */ - rcu_exp_gp_seq_end(rsp); - mutex_unlock(&rnp_unlock->exp_funnel_mutex); + /* Wait for ->blkd_tasks lists to drain, then wake everyone up. 
*/ + rcu_exp_wait_wake(rsp, s); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 1088e64f01ad..86782f9a4604 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -185,17 +185,16 @@ static int show_rcuexp(struct seq_file *m, void *v) int cpu; struct rcu_state *rsp = (struct rcu_state *)m->private; struct rcu_data *rdp; - unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0; + unsigned long s1 = 0, s2 = 0, s3 = 0; for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(rsp->rda, cpu); - s0 += atomic_long_read(&rdp->expedited_workdone0); - s1 += atomic_long_read(&rdp->expedited_workdone1); - s2 += atomic_long_read(&rdp->expedited_workdone2); - s3 += atomic_long_read(&rdp->expedited_workdone3); + s1 += atomic_long_read(&rdp->exp_workdone1); + s2 += atomic_long_read(&rdp->exp_workdone2); + s3 += atomic_long_read(&rdp->exp_workdone3); } - seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", - rsp->expedited_sequence, s0, s1, s2, s3, + seq_printf(m, "s=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", + rsp->expedited_sequence, s1, s2, s3, atomic_long_read(&rsp->expedited_normal), atomic_read(&rsp->expedited_need_qs), rsp->expedited_sequence / 2); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index ca828b41c938..3ccdc8eebc5a 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -67,7 +67,7 @@ static int rcu_normal_after_boot; module_param(rcu_normal_after_boot, int, 0); #endif /* #ifndef CONFIG_TINY_RCU */ -#if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT) +#ifdef CONFIG_DEBUG_LOCK_ALLOC /** * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section? 
* @@ -111,7 +111,7 @@ int rcu_read_lock_sched_held(void) return 0; if (debug_locks) lockdep_opinion = lock_is_held(&rcu_sched_lock_map); - return lockdep_opinion || preempt_count() != 0 || irqs_disabled(); + return lockdep_opinion || !preemptible(); } EXPORT_SYMBOL(rcu_read_lock_sched_held); #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 40748dc8ea3e..e7dd0ec169be 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3188,25 +3188,17 @@ static inline void check_schedstat_required(void) static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING); - bool curr = cfs_rq->curr == se; - /* - * If we're the current task, we must renormalise before calling - * update_curr(). + * Update the normalized vruntime before updating min_vruntime + * through calling update_curr(). */ - if (renorm && curr) + if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING)) se->vruntime += cfs_rq->min_vruntime; - update_curr(cfs_rq); - /* - * Otherwise, renormalise after, such that we're placed at the current - * moment in time, instead of some random moment in the past. + * Update run-time statistics of the 'current'. 
*/ - if (renorm && !curr) - se->vruntime += cfs_rq->min_vruntime; - + update_curr(cfs_rq); enqueue_entity_load_avg(cfs_rq, se); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); @@ -3222,7 +3214,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_stats_enqueue(cfs_rq, se); check_spread(cfs_rq, se); } - if (!curr) + if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); se->on_rq = 1; diff --git a/kernel/signal.c b/kernel/signal.c index aa9bf00749c1..ab122a2cee41 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3099,12 +3099,14 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s oss.ss_sp = (void __user *) current->sas_ss_sp; oss.ss_size = current->sas_ss_size; - oss.ss_flags = sas_ss_flags(sp); + oss.ss_flags = sas_ss_flags(sp) | + (current->sas_ss_flags & SS_FLAG_BITS); if (uss) { void __user *ss_sp; size_t ss_size; - int ss_flags; + unsigned ss_flags; + int ss_mode; error = -EFAULT; if (!access_ok(VERIFY_READ, uss, sizeof(*uss))) @@ -3119,18 +3121,13 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s if (on_sig_stack(sp)) goto out; + ss_mode = ss_flags & ~SS_FLAG_BITS; error = -EINVAL; - /* - * Note - this code used to test ss_flags incorrectly: - * old code may have been written using ss_flags==0 - * to mean ss_flags==SS_ONSTACK (as this was the only - * way that worked) - this fix preserves that older - * mechanism. 
- */ - if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0) + if (ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK && + ss_mode != 0) goto out; - if (ss_flags == SS_DISABLE) { + if (ss_mode == SS_DISABLE) { ss_size = 0; ss_sp = NULL; } else { @@ -3141,6 +3138,7 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s current->sas_ss_sp = (unsigned long) ss_sp; current->sas_ss_size = ss_size; + current->sas_ss_flags = ss_flags; } error = 0; @@ -3171,9 +3169,14 @@ int restore_altstack(const stack_t __user *uss) int __save_altstack(stack_t __user *uss, unsigned long sp) { struct task_struct *t = current; - return __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) | - __put_user(sas_ss_flags(sp), &uss->ss_flags) | + int err = __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) | + __put_user(t->sas_ss_flags, &uss->ss_flags) | __put_user(t->sas_ss_size, &uss->ss_size); + if (err) + return err; + if (t->sas_ss_flags & SS_AUTODISARM) + sas_ss_reset(t); + return 0; } #ifdef CONFIG_COMPAT diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 58e3310c9b21..3daa49ff0719 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -262,7 +262,7 @@ static void tick_nohz_dep_set_all(atomic_t *dep, { int prev; - prev = atomic_fetch_or(dep, BIT(bit)); + prev = atomic_fetch_or(BIT(bit), dep); if (!prev) tick_nohz_full_kick_all(); } @@ -292,7 +292,7 @@ void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) ts = per_cpu_ptr(&tick_cpu_sched, cpu); - prev = atomic_fetch_or(&ts->tick_dep_mask, BIT(bit)); + prev = atomic_fetch_or(BIT(bit), &ts->tick_dep_mask); if (!prev) { preempt_disable(); /* Perf needs local kick that is NMI safe */ diff --git a/kernel/torture.c b/kernel/torture.c index 44aa462d033f..fa0bdeee17ac 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -451,6 +451,7 @@ static int torture_shutdown(void *arg) torture_shutdown_hook(); else VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), 
skipping."); + ftrace_dump(DUMP_ALL); kernel_power_off(); /* Shut down the system. */ return 0; } @@ -602,8 +603,9 @@ bool torture_init_begin(char *ttype, bool v, int *runnable) { mutex_lock(&fullstop_mutex); if (torture_type != NULL) { - pr_alert("torture_init_begin: refusing %s init: %s running", + pr_alert("torture_init_begin: Refusing %s init: %s running.\n", ttype, torture_type); + pr_alert("torture_init_begin: One torture test at a time!\n"); mutex_unlock(&fullstop_mutex); return false; } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3bfdff06eea7..5f5068e94003 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4554,6 +4554,17 @@ static void rebind_workers(struct worker_pool *pool) pool->attrs->cpumask) < 0); spin_lock_irq(&pool->lock); + + /* + * XXX: CPU hotplug notifiers are weird and can call DOWN_FAILED + * w/o preceding DOWN_PREPARE. Work around it. CPU hotplug is + * being reworked and this can go away in time. + */ + if (!(pool->flags & POOL_DISASSOCIATED)) { + spin_unlock_irq(&pool->lock); + return; + } + pool->flags &= ~POOL_DISASSOCIATED; for_each_pool_worker(worker, pool) { diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 1e9a607534ca..f4b797a690ba 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1289,6 +1289,39 @@ config TORTURE_TEST tristate default n +config RCU_PERF_TEST + tristate "performance tests for RCU" + depends on DEBUG_KERNEL + select TORTURE_TEST + select SRCU + select TASKS_RCU + default n + help + This option provides a kernel module that runs performance + tests on the RCU infrastructure. The kernel module may be built + after the fact on the running kernel to be tested, if desired. + + Say Y here if you want RCU performance tests to be built into + the kernel. + Say M if you want the RCU performance tests to build as a module. + Say N if you are unsure. 
+ +config RCU_PERF_TEST_RUNNABLE + bool "performance tests for RCU runnable by default" + depends on RCU_PERF_TEST = y + default n + help + This option provides a way to build the RCU performance tests + directly into the kernel without them starting up at boot time. + You can use /sys/module to manually override this setting. + This /sys/module entry is available only when the RCU performance + tests have been built into the kernel. + + Say Y here if you want the RCU performance tests to start during + boot (you probably don't). + Say N here if you want the RCU performance tests to start only + after being manually enabled via /sys/module. + config RCU_TORTURE_TEST tristate "torture tests for RCU" depends on DEBUG_KERNEL diff --git a/lib/Makefile b/lib/Makefile index 7bd6fd436c97..a65e9a861535 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -23,7 +23,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ rbtree.o radix-tree.o dump_stack.o timerqueue.o\ idr.o int_sqrt.o extable.o \ sha1.o md5.o irq_regs.o argv_split.o \ - proportions.o flex_proportions.o ratelimit.o show_mem.o \ + flex_proportions.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ earlycpio.o seq_buf.o nmi_backtrace.o diff --git a/lib/asn1_decoder.c b/lib/asn1_decoder.c index 2b3f46c049d4..554522934c44 100644 --- a/lib/asn1_decoder.c +++ b/lib/asn1_decoder.c @@ -74,7 +74,7 @@ next_tag: /* Extract a tag from the data */ tag = data[dp++]; - if (tag == 0) { + if (tag == ASN1_EOC) { /* It appears to be an EOC. 
*/ if (data[dp++] != 0) goto invalid_eoc; @@ -96,10 +96,8 @@ next_tag: /* Extract the length */ len = data[dp++]; - if (len <= 0x7f) { - dp += len; - goto next_tag; - } + if (len <= 0x7f) + goto check_length; if (unlikely(len == ASN1_INDEFINITE_LENGTH)) { /* Indefinite length */ @@ -110,14 +108,18 @@ next_tag: } n = len - 0x80; - if (unlikely(n > sizeof(size_t) - 1)) + if (unlikely(n > sizeof(len) - 1)) goto length_too_long; if (unlikely(n > datalen - dp)) goto data_overrun_error; - for (len = 0; n > 0; n--) { + len = 0; + for (; n > 0; n--) { len <<= 8; len |= data[dp++]; } +check_length: + if (len > datalen - dp) + goto data_overrun_error; dp += len; goto next_tag; diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 5fecddc32b1b..ca5316e0087b 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -569,6 +569,25 @@ unsigned long iov_iter_alignment(const struct iov_iter *i) } EXPORT_SYMBOL(iov_iter_alignment); +unsigned long iov_iter_gap_alignment(const struct iov_iter *i) +{ + unsigned long res = 0; + size_t size = i->count; + if (!size) + return 0; + + iterate_all_kinds(i, size, v, + (res |= (!res ? 0 : (unsigned long)v.iov_base) | + (size != v.iov_len ? size : 0), 0), + (res |= (!res ? 0 : (unsigned long)v.bv_offset) | + (size != v.bv_len ? size : 0)), + (res |= (!res ? 0 : (unsigned long)v.iov_base) | + (size != v.iov_len ? 
size : 0)) + ); + return res; +} +EXPORT_SYMBOL(iov_iter_gap_alignment); + ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start) diff --git a/lib/proportions.c b/lib/proportions.c deleted file mode 100644 index efa54f259ea9..000000000000 --- a/lib/proportions.c +++ /dev/null @@ -1,407 +0,0 @@ -/* - * Floating proportions - * - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra - * - * Description: - * - * The floating proportion is a time derivative with an exponentially decaying - * history: - * - * p_{j} = \Sum_{i=0} (dx_{j}/dt_{-i}) / 2^(1+i) - * - * Where j is an element from {prop_local}, x_{j} is j's number of events, - * and i the time period over which the differential is taken. So d/dt_{-i} is - * the differential over the i-th last period. - * - * The decaying history gives smooth transitions. The time differential carries - * the notion of speed. - * - * The denominator is 2^(1+i) because we want the series to be normalised, ie. - * - * \Sum_{i=0} 1/2^(1+i) = 1 - * - * Further more, if we measure time (t) in the same events as x; so that: - * - * t = \Sum_{j} x_{j} - * - * we get that: - * - * \Sum_{j} p_{j} = 1 - * - * Writing this in an iterative fashion we get (dropping the 'd's): - * - * if (++x_{j}, ++t > period) - * t /= 2; - * for_each (j) - * x_{j} /= 2; - * - * so that: - * - * p_{j} = x_{j} / t; - * - * We optimize away the '/= 2' for the global time delta by noting that: - * - * if (++t > period) t /= 2: - * - * Can be approximated by: - * - * period/2 + (++t % period/2) - * - * [ Furthermore, when we choose period to be 2^n it can be written in terms of - * binary operations and wraparound artefacts disappear. ] - * - * Also note that this yields a natural counter of the elapsed periods: - * - * c = t / (period/2) - * - * [ Its monotonic increasing property can be applied to mitigate the wrap- - * around issue. 
] - * - * This allows us to do away with the loop over all prop_locals on each period - * expiration. By remembering the period count under which it was last accessed - * as c_{j}, we can obtain the number of 'missed' cycles from: - * - * c - c_{j} - * - * We can then lazily catch up to the global period count every time we are - * going to use x_{j}, by doing: - * - * x_{j} /= 2^(c - c_{j}), c_{j} = c - */ - -#include <linux/proportions.h> -#include <linux/rcupdate.h> - -int prop_descriptor_init(struct prop_descriptor *pd, int shift, gfp_t gfp) -{ - int err; - - if (shift > PROP_MAX_SHIFT) - shift = PROP_MAX_SHIFT; - - pd->index = 0; - pd->pg[0].shift = shift; - mutex_init(&pd->mutex); - err = percpu_counter_init(&pd->pg[0].events, 0, gfp); - if (err) - goto out; - - err = percpu_counter_init(&pd->pg[1].events, 0, gfp); - if (err) - percpu_counter_destroy(&pd->pg[0].events); - -out: - return err; -} - -/* - * We have two copies, and flip between them to make it seem like an atomic - * update. The update is not really atomic wrt the events counter, but - * it is internally consistent with the bit layout depending on shift. - * - * We copy the events count, move the bits around and flip the index. 
- */ -void prop_change_shift(struct prop_descriptor *pd, int shift) -{ - int index; - int offset; - u64 events; - unsigned long flags; - - if (shift > PROP_MAX_SHIFT) - shift = PROP_MAX_SHIFT; - - mutex_lock(&pd->mutex); - - index = pd->index ^ 1; - offset = pd->pg[pd->index].shift - shift; - if (!offset) - goto out; - - pd->pg[index].shift = shift; - - local_irq_save(flags); - events = percpu_counter_sum(&pd->pg[pd->index].events); - if (offset < 0) - events <<= -offset; - else - events >>= offset; - percpu_counter_set(&pd->pg[index].events, events); - - /* - * ensure the new pg is fully written before the switch - */ - smp_wmb(); - pd->index = index; - local_irq_restore(flags); - - synchronize_rcu(); - -out: - mutex_unlock(&pd->mutex); -} - -/* - * wrap the access to the data in an rcu_read_lock() section; - * this is used to track the active references. - */ -static struct prop_global *prop_get_global(struct prop_descriptor *pd) -__acquires(RCU) -{ - int index; - - rcu_read_lock(); - index = pd->index; - /* - * match the wmb from vcd_flip() - */ - smp_rmb(); - return &pd->pg[index]; -} - -static void prop_put_global(struct prop_descriptor *pd, struct prop_global *pg) -__releases(RCU) -{ - rcu_read_unlock(); -} - -static void -prop_adjust_shift(int *pl_shift, unsigned long *pl_period, int new_shift) -{ - int offset = *pl_shift - new_shift; - - if (!offset) - return; - - if (offset < 0) - *pl_period <<= -offset; - else - *pl_period >>= offset; - - *pl_shift = new_shift; -} - -/* - * PERCPU - */ - -#define PROP_BATCH (8*(1+ilog2(nr_cpu_ids))) - -int prop_local_init_percpu(struct prop_local_percpu *pl, gfp_t gfp) -{ - raw_spin_lock_init(&pl->lock); - pl->shift = 0; - pl->period = 0; - return percpu_counter_init(&pl->events, 0, gfp); -} - -void prop_local_destroy_percpu(struct prop_local_percpu *pl) -{ - percpu_counter_destroy(&pl->events); -} - -/* - * Catch up with missed period expirations. 
- * - * until (c_{j} == c) - * x_{j} -= x_{j}/2; - * c_{j}++; - */ -static -void prop_norm_percpu(struct prop_global *pg, struct prop_local_percpu *pl) -{ - unsigned long period = 1UL << (pg->shift - 1); - unsigned long period_mask = ~(period - 1); - unsigned long global_period; - unsigned long flags; - - global_period = percpu_counter_read(&pg->events); - global_period &= period_mask; - - /* - * Fast path - check if the local and global period count still match - * outside of the lock. - */ - if (pl->period == global_period) - return; - - raw_spin_lock_irqsave(&pl->lock, flags); - prop_adjust_shift(&pl->shift, &pl->period, pg->shift); - - /* - * For each missed period, we half the local counter. - * basically: - * pl->events >> (global_period - pl->period); - */ - period = (global_period - pl->period) >> (pg->shift - 1); - if (period < BITS_PER_LONG) { - s64 val = percpu_counter_read(&pl->events); - - if (val < (nr_cpu_ids * PROP_BATCH)) - val = percpu_counter_sum(&pl->events); - - __percpu_counter_add(&pl->events, -val + (val >> period), - PROP_BATCH); - } else - percpu_counter_set(&pl->events, 0); - - pl->period = global_period; - raw_spin_unlock_irqrestore(&pl->lock, flags); -} - -/* - * ++x_{j}, ++t - */ -void __prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl) -{ - struct prop_global *pg = prop_get_global(pd); - - prop_norm_percpu(pg, pl); - __percpu_counter_add(&pl->events, 1, PROP_BATCH); - percpu_counter_add(&pg->events, 1); - prop_put_global(pd, pg); -} - -/* - * identical to __prop_inc_percpu, except that it limits this pl's fraction to - * @frac/PROP_FRAC_BASE by ignoring events when this limit has been exceeded. 
- */ -void __prop_inc_percpu_max(struct prop_descriptor *pd, - struct prop_local_percpu *pl, long frac) -{ - struct prop_global *pg = prop_get_global(pd); - - prop_norm_percpu(pg, pl); - - if (unlikely(frac != PROP_FRAC_BASE)) { - unsigned long period_2 = 1UL << (pg->shift - 1); - unsigned long counter_mask = period_2 - 1; - unsigned long global_count; - long numerator, denominator; - - numerator = percpu_counter_read_positive(&pl->events); - global_count = percpu_counter_read(&pg->events); - denominator = period_2 + (global_count & counter_mask); - - if (numerator > ((denominator * frac) >> PROP_FRAC_SHIFT)) - goto out_put; - } - - percpu_counter_add(&pl->events, 1); - percpu_counter_add(&pg->events, 1); - -out_put: - prop_put_global(pd, pg); -} - -/* - * Obtain a fraction of this proportion - * - * p_{j} = x_{j} / (period/2 + t % period/2) - */ -void prop_fraction_percpu(struct prop_descriptor *pd, - struct prop_local_percpu *pl, - long *numerator, long *denominator) -{ - struct prop_global *pg = prop_get_global(pd); - unsigned long period_2 = 1UL << (pg->shift - 1); - unsigned long counter_mask = period_2 - 1; - unsigned long global_count; - - prop_norm_percpu(pg, pl); - *numerator = percpu_counter_read_positive(&pl->events); - - global_count = percpu_counter_read(&pg->events); - *denominator = period_2 + (global_count & counter_mask); - - prop_put_global(pd, pg); -} - -/* - * SINGLE - */ - -int prop_local_init_single(struct prop_local_single *pl) -{ - raw_spin_lock_init(&pl->lock); - pl->shift = 0; - pl->period = 0; - pl->events = 0; - return 0; -} - -void prop_local_destroy_single(struct prop_local_single *pl) -{ -} - -/* - * Catch up with missed period expirations. 
- */ -static -void prop_norm_single(struct prop_global *pg, struct prop_local_single *pl) -{ - unsigned long period = 1UL << (pg->shift - 1); - unsigned long period_mask = ~(period - 1); - unsigned long global_period; - unsigned long flags; - - global_period = percpu_counter_read(&pg->events); - global_period &= period_mask; - - /* - * Fast path - check if the local and global period count still match - * outside of the lock. - */ - if (pl->period == global_period) - return; - - raw_spin_lock_irqsave(&pl->lock, flags); - prop_adjust_shift(&pl->shift, &pl->period, pg->shift); - /* - * For each missed period, we half the local counter. - */ - period = (global_period - pl->period) >> (pg->shift - 1); - if (likely(period < BITS_PER_LONG)) - pl->events >>= period; - else - pl->events = 0; - pl->period = global_period; - raw_spin_unlock_irqrestore(&pl->lock, flags); -} - -/* - * ++x_{j}, ++t - */ -void __prop_inc_single(struct prop_descriptor *pd, struct prop_local_single *pl) -{ - struct prop_global *pg = prop_get_global(pd); - - prop_norm_single(pg, pl); - pl->events++; - percpu_counter_add(&pg->events, 1); - prop_put_global(pd, pg); -} - -/* - * Obtain a fraction of this proportion - * - * p_{j} = x_{j} / (period/2 + t % period/2) - */ -void prop_fraction_single(struct prop_descriptor *pd, - struct prop_local_single *pl, - long *numerator, long *denominator) -{ - struct prop_global *pg = prop_get_global(pd); - unsigned long period_2 = 1UL << (pg->shift - 1); - unsigned long counter_mask = period_2 - 1; - unsigned long global_count; - - prop_norm_single(pg, pl); - *numerator = pl->events; - - global_count = percpu_counter_read(&pg->events); - *denominator = period_2 + (global_count & counter_mask); - - prop_put_global(pd, pg); -} diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f7daa7de8f48..b49ee126d4d1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1298,15 +1298,9 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, 
VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page); /* * We can only reuse the page if nobody else maps the huge page or it's - * part. We can do it by checking page_mapcount() on each sub-page, but - * it's expensive. - * The cheaper way is to check page_count() to be equal 1: every - * mapcount takes page reference reference, so this way we can - * guarantee, that the PMD is the only mapping. - * This can give false negative if somebody pinned the page, but that's - * fine. + * part. */ - if (page_mapcount(page) == 1 && page_count(page) == 1) { + if (page_trans_huge_mapcount(page, NULL) == 1) { pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); @@ -2079,7 +2073,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, if (pte_write(pteval)) { writable = true; } else { - if (PageSwapCache(page) && !reuse_swap_page(page)) { + if (PageSwapCache(page) && + !reuse_swap_page(page, NULL)) { unlock_page(page); result = SCAN_SWAP_CACHE_PAGE; goto out; @@ -3223,6 +3218,64 @@ int total_mapcount(struct page *page) } /* + * This calculates accurately how many mappings a transparent hugepage + * has (unlike page_mapcount() which isn't fully accurate). This full + * accuracy is primarily needed to know if copy-on-write faults can + * reuse the page and change the mapping to read-write instead of + * copying them. At the same time this returns the total_mapcount too. + * + * The function returns the highest mapcount any one of the subpages + * has. If the return value is one, even if different processes are + * mapping different subpages of the transparent hugepage, they can + * all reuse it, because each process is reusing a different subpage. + * + * The total_mapcount is instead counting all virtual mappings of the + * subpages. 
If the total_mapcount is equal to "one", it tells the + * caller all mappings belong to the same "mm" and in turn the + * anon_vma of the transparent hugepage can become the vma->anon_vma + * local one as no other process may be mapping any of the subpages. + * + * It would be more accurate to replace page_mapcount() with + * page_trans_huge_mapcount(), however we only use + * page_trans_huge_mapcount() in the copy-on-write faults where we + * need full accuracy to avoid breaking page pinning, because + * page_trans_huge_mapcount() is slower than page_mapcount(). + */ +int page_trans_huge_mapcount(struct page *page, int *total_mapcount) +{ + int i, ret, _total_mapcount, mapcount; + + /* hugetlbfs shouldn't call it */ + VM_BUG_ON_PAGE(PageHuge(page), page); + + if (likely(!PageTransCompound(page))) { + mapcount = atomic_read(&page->_mapcount) + 1; + if (total_mapcount) + *total_mapcount = mapcount; + return mapcount; + } + + page = compound_head(page); + + _total_mapcount = ret = 0; + for (i = 0; i < HPAGE_PMD_NR; i++) { + mapcount = atomic_read(&page[i]._mapcount) + 1; + ret = max(ret, mapcount); + _total_mapcount += mapcount; + } + if (PageDoubleMap(page)) { + ret -= 1; + _total_mapcount -= HPAGE_PMD_NR; + } + mapcount = compound_mapcount(page); + ret += mapcount; + _total_mapcount += mapcount; + if (total_mapcount) + *total_mapcount = _total_mapcount; + return ret; +} + +/* * This function splits huge page into normal pages. @page can point to any * subpage of huge page to split. Split doesn't change the position of @page. 
* @@ -783,6 +783,7 @@ static int unmerge_and_remove_all_rmap_items(void) } remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list); + up_read(&mm->mmap_sem); spin_lock(&ksm_mmlist_lock); ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, @@ -794,12 +795,9 @@ static int unmerge_and_remove_all_rmap_items(void) free_mm_slot(mm_slot); clear_bit(MMF_VM_MERGEABLE, &mm->flags); - up_read(&mm->mmap_sem); mmdrop(mm); - } else { + } else spin_unlock(&ksm_mmlist_lock); - up_read(&mm->mmap_sem); - } } /* Clean up stable nodes, but don't worry if some are still busy */ @@ -1663,8 +1661,15 @@ next_mm: up_read(&mm->mmap_sem); mmdrop(mm); } else { - spin_unlock(&ksm_mmlist_lock); up_read(&mm->mmap_sem); + /* + * up_read(&mm->mmap_sem) first because after + * spin_unlock(&ksm_mmlist_lock) run, the "mm" may + * already have been freed under us by __ksm_exit() + * because the "mm_slot" is still hashed and + * ksm_scan.mm_slot doesn't point to it anymore. + */ + spin_unlock(&ksm_mmlist_lock); } /* Repeat until we've completed scanning the whole list */ diff --git a/mm/memory.c b/mm/memory.c index 52c218e2b724..07493e34ab7e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2373,6 +2373,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * not dirty accountable. */ if (PageAnon(old_page) && !PageKsm(old_page)) { + int total_mapcount; if (!trylock_page(old_page)) { get_page(old_page); pte_unmap_unlock(page_table, ptl); @@ -2387,13 +2388,18 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, } put_page(old_page); } - if (reuse_swap_page(old_page)) { - /* - * The page is all ours. Move it to our anon_vma so - * the rmap code will not search our parent or siblings. - * Protected against the rmap code by the page lock. - */ - page_move_anon_rmap(old_page, vma, address); + if (reuse_swap_page(old_page, &total_mapcount)) { + if (total_mapcount == 1) { + /* + * The page is all ours. 
Move it to + * our anon_vma so the rmap code will + * not search our parent or siblings. + * Protected against the rmap code by + * the page lock. + */ + page_move_anon_rmap(compound_head(old_page), + vma, address); + } unlock_page(old_page); return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte, old_page, 0, 0); @@ -2617,7 +2623,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, inc_mm_counter_fast(mm, MM_ANONPAGES); dec_mm_counter_fast(mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); - if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { + if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); flags &= ~FAULT_FLAG_WRITE; ret |= VM_FAULT_WRITE; diff --git a/mm/swapfile.c b/mm/swapfile.c index 83874eced5bf..031713ab40ce 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -922,18 +922,19 @@ out: * to it. And as a side-effect, free up its swap: because the old content * on disk will never be read, and seeking back there to write new content * later would only waste time away from clustering. + * + * NOTE: total_mapcount should not be relied upon by the caller if + * reuse_swap_page() returns false, but it may be always overwritten + * (see the other implementation for CONFIG_SWAP=n). 
*/ -int reuse_swap_page(struct page *page) +bool reuse_swap_page(struct page *page, int *total_mapcount) { int count; VM_BUG_ON_PAGE(!PageLocked(page), page); if (unlikely(PageKsm(page))) - return 0; - /* The page is part of THP and cannot be reused */ - if (PageTransCompound(page)) - return 0; - count = page_mapcount(page); + return false; + count = page_trans_huge_mapcount(page, total_mapcount); if (count <= 1 && PageSwapCache(page)) { count += page_swapcount(page); if (count == 1 && !PageWriteback(page)) { diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index d97268e8ff10..2b68418c7198 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -975,6 +975,8 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) val = 65535 - 40; if (type == RTAX_MTU && val > 65535 - 15) val = 65535 - 15; + if (type == RTAX_HOPLIMIT && val > 255) + val = 255; if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) return -EINVAL; fi->fib_metrics[type - 1] = val; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 205a2b8a5a84..4cc84212cce1 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -398,7 +398,10 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) iph->saddr, iph->daddr, tpi->key); if (tunnel) { - skb_pop_mac_header(skb); + if (tunnel->dev->type != ARPHRD_NONE) + skb_pop_mac_header(skb); + else + skb_reset_mac_header(skb); if (tunnel->collect_md) { __be16 flags; __be64 tun_id; @@ -1031,6 +1034,8 @@ static void ipgre_netlink_parms(struct net_device *dev, struct ip_tunnel *t = netdev_priv(dev); t->collect_md = true; + if (dev->type == ARPHRD_IPGRE) + dev->type = ARPHRD_NONE; } } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 441ae9da3a23..79a03b87a771 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2640,8 +2640,10 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) */ if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) || 
skb_headroom(skb) >= 0xFFFF)) { - struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER, - GFP_ATOMIC); + struct sk_buff *nskb; + + skb_mstamp_get(&skb->skb_mstamp); + nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : -ENOBUFS; } else { diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d916d6ab9ad2..6f32944e0223 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1750,6 +1750,8 @@ static int ip6_convert_metrics(struct mx6_config *mxc, } else { val = nla_get_u32(nla); } + if (type == RTAX_HOPLIMIT && val > 255) + val = 255; if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) goto err; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index afde5f5e728a..e27fd17c6743 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -66,7 +66,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_locks); __cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock); EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock); -static __read_mostly spinlock_t nf_conntrack_locks_all_lock; +static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock); static __read_mostly bool nf_conntrack_locks_all; void nf_conntrack_lock(spinlock_t *lock) __acquires(lock) @@ -1778,6 +1778,7 @@ void nf_conntrack_init_end(void) int nf_conntrack_init_net(struct net *net) { + static atomic64_t unique_id; int ret = -ENOMEM; int cpu; @@ -1800,7 +1801,8 @@ int nf_conntrack_init_net(struct net *net) if (!net->ct.stat) goto err_pcpu_lists; - net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%p", net); + net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%llu", + (u64)atomic64_inc_return(&unique_id)); if (!net->ct.slabname) goto err_slabname; diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 4c2b4c0c4d5f..dbd0803b1827 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -96,6 +96,8 @@ static int nfnl_acct_new(struct 
net *net, struct sock *nfnl, return -EINVAL; if (flags & NFACCT_F_OVERQUOTA) return -EINVAL; + if ((flags & NFACCT_F_QUOTA) && !tb[NFACCT_QUOTA]) + return -EINVAL; size += sizeof(u64); } diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index 29d2c31f406c..daf45da448fa 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -236,6 +236,7 @@ static void idletimer_tg_destroy(const struct xt_tgdtor_param *par) list_del(&info->timer->entry); del_timer_sync(&info->timer->timer); + cancel_work_sync(&info->timer->work); sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr); kfree(info->timer->attr.attr.name); kfree(info->timer); diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index b5fea1101faa..10c84d882881 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -776,6 +776,19 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, return -EINVAL; } + /* Userspace may decide to perform a ct lookup without a helper + * specified followed by a (recirculate and) commit with one. + * Therefore, for unconfirmed connections which we will commit, + * we need to attach the helper here. 
+ */ + if (!nf_ct_is_confirmed(ct) && info->commit && + info->helper && !nfct_help(ct)) { + int err = __nf_ct_try_assign_helper(ct, info->ct, + GFP_ATOMIC); + if (err) + return err; + } + /* Call the helper only if: * - nf_conntrack_in() was executed above ("!cached") for a * confirmed connection, or diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index c589a9ba506a..343d011aa818 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -423,7 +423,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, u16 ife_type = 0; u8 *daddr = NULL; u8 *saddr = NULL; - int ret = 0; + int ret = 0, exists = 0; int err; err = nla_parse_nested(tb, TCA_IFE_MAX, nla, ife_policy); @@ -435,25 +435,29 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_IFE_PARMS]); + exists = tcf_hash_check(tn, parm->index, a, bind); + if (exists && bind) + return 0; + if (parm->flags & IFE_ENCODE) { /* Until we get issued the ethertype, we cant have * a default.. **/ if (!tb[TCA_IFE_TYPE]) { + if (exists) + tcf_hash_release(a, bind); pr_info("You MUST pass etherype for encoding\n"); return -EINVAL; } } - if (!tcf_hash_check(tn, parm->index, a, bind)) { + if (!exists) { ret = tcf_hash_create(tn, parm->index, est, a, sizeof(*ife), bind, false); if (ret) return ret; ret = ACT_P_CREATED; } else { - if (bind) /* dont override defaults */ - return 0; tcf_hash_release(a, bind); if (!ovr) return -EEXIST; @@ -495,6 +499,8 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, NULL); if (err) { metadata_parse_err: + if (exists) + tcf_hash_release(a, bind); if (ret == ACT_P_CREATED) _tcf_ife_cleanup(a, bind); diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 350e134cffb3..8b5270008a6e 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -96,7 +96,7 @@ static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla, struct tcf_ipt *ipt; struct xt_entry_target *td, *t; char *tname; - int ret = 0, err; + int ret = 0, err, 
exists = 0; u32 hook = 0; u32 index = 0; @@ -107,18 +107,23 @@ static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla, if (err < 0) return err; - if (tb[TCA_IPT_HOOK] == NULL) - return -EINVAL; - if (tb[TCA_IPT_TARG] == NULL) + if (tb[TCA_IPT_INDEX] != NULL) + index = nla_get_u32(tb[TCA_IPT_INDEX]); + + exists = tcf_hash_check(tn, index, a, bind); + if (exists && bind) + return 0; + + if (tb[TCA_IPT_HOOK] == NULL || tb[TCA_IPT_TARG] == NULL) { + if (exists) + tcf_hash_release(a, bind); return -EINVAL; + } td = (struct xt_entry_target *)nla_data(tb[TCA_IPT_TARG]); if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size) return -EINVAL; - if (tb[TCA_IPT_INDEX] != NULL) - index = nla_get_u32(tb[TCA_IPT_INDEX]); - if (!tcf_hash_check(tn, index, a, bind)) { ret = tcf_hash_create(tn, index, est, a, sizeof(*ipt), bind, false); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index e8a760cf7775..8f3948dd38b8 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -61,7 +61,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, struct tc_mirred *parm; struct tcf_mirred *m; struct net_device *dev; - int ret, ok_push = 0; + int ret, ok_push = 0, exists = 0; if (nla == NULL) return -EINVAL; @@ -71,17 +71,27 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, if (tb[TCA_MIRRED_PARMS] == NULL) return -EINVAL; parm = nla_data(tb[TCA_MIRRED_PARMS]); + + exists = tcf_hash_check(tn, parm->index, a, bind); + if (exists && bind) + return 0; + switch (parm->eaction) { case TCA_EGRESS_MIRROR: case TCA_EGRESS_REDIR: break; default: + if (exists) + tcf_hash_release(a, bind); return -EINVAL; } if (parm->ifindex) { dev = __dev_get_by_index(net, parm->ifindex); - if (dev == NULL) + if (dev == NULL) { + if (exists) + tcf_hash_release(a, bind); return -ENODEV; + } switch (dev->type) { case ARPHRD_TUNNEL: case ARPHRD_TUNNEL6: @@ -99,7 +109,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, dev = NULL; } - if 
(!tcf_hash_check(tn, parm->index, a, bind)) { + if (!exists) { if (dev == NULL) return -EINVAL; ret = tcf_hash_create(tn, parm->index, est, a, @@ -108,9 +118,6 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, return ret; ret = ACT_P_CREATED; } else { - if (bind) - return 0; - tcf_hash_release(a, bind); if (!ovr) return -EEXIST; diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 75b2be13fbcc..3a33fb648a6d 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -87,7 +87,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, struct tc_defact *parm; struct tcf_defact *d; char *defdata; - int ret = 0, err; + int ret = 0, err, exists = 0; if (nla == NULL) return -EINVAL; @@ -99,13 +99,21 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, if (tb[TCA_DEF_PARMS] == NULL) return -EINVAL; - if (tb[TCA_DEF_DATA] == NULL) - return -EINVAL; parm = nla_data(tb[TCA_DEF_PARMS]); + exists = tcf_hash_check(tn, parm->index, a, bind); + if (exists && bind) + return 0; + + if (tb[TCA_DEF_DATA] == NULL) { + if (exists) + tcf_hash_release(a, bind); + return -EINVAL; + } + defdata = nla_data(tb[TCA_DEF_DATA]); - if (!tcf_hash_check(tn, parm->index, a, bind)) { + if (!exists) { ret = tcf_hash_create(tn, parm->index, est, a, sizeof(*d), bind, false); if (ret) @@ -122,8 +130,6 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, } else { d = to_defact(a); - if (bind) - return 0; tcf_hash_release(a, bind); if (!ovr) return -EEXIST; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index cfcdbdc00c9b..69da5a8f0034 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -69,7 +69,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct tcf_skbedit *d; u32 flags = 0, *priority = NULL, *mark = NULL; u16 *queue_mapping = NULL; - int ret = 0, err; + int ret = 0, err, exists = 0; if (nla == NULL) return -EINVAL; @@ -96,12 +96,18 @@ static int tcf_skbedit_init(struct net 
*net, struct nlattr *nla, mark = nla_data(tb[TCA_SKBEDIT_MARK]); } - if (!flags) - return -EINVAL; - parm = nla_data(tb[TCA_SKBEDIT_PARMS]); - if (!tcf_hash_check(tn, parm->index, a, bind)) { + exists = tcf_hash_check(tn, parm->index, a, bind); + if (exists && bind) + return 0; + + if (!flags) { + tcf_hash_release(a, bind); + return -EINVAL; + } + + if (!exists) { ret = tcf_hash_create(tn, parm->index, est, a, sizeof(*d), bind, false); if (ret) @@ -111,8 +117,6 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, ret = ACT_P_CREATED; } else { d = to_skbedit(a); - if (bind) - return 0; tcf_hash_release(a, bind); if (!ovr) return -EEXIST; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index bab8ae0cefc0..c45f926dafb9 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -77,7 +77,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, int action; __be16 push_vid = 0; __be16 push_proto = 0; - int ret = 0; + int ret = 0, exists = 0; int err; if (!nla) @@ -90,15 +90,25 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, if (!tb[TCA_VLAN_PARMS]) return -EINVAL; parm = nla_data(tb[TCA_VLAN_PARMS]); + exists = tcf_hash_check(tn, parm->index, a, bind); + if (exists && bind) + return 0; + switch (parm->v_action) { case TCA_VLAN_ACT_POP: break; case TCA_VLAN_ACT_PUSH: - if (!tb[TCA_VLAN_PUSH_VLAN_ID]) + if (!tb[TCA_VLAN_PUSH_VLAN_ID]) { + if (exists) + tcf_hash_release(a, bind); return -EINVAL; + } push_vid = nla_get_u16(tb[TCA_VLAN_PUSH_VLAN_ID]); - if (push_vid >= VLAN_VID_MASK) + if (push_vid >= VLAN_VID_MASK) { + if (exists) + tcf_hash_release(a, bind); return -ERANGE; + } if (tb[TCA_VLAN_PUSH_VLAN_PROTOCOL]) { push_proto = nla_get_be16(tb[TCA_VLAN_PUSH_VLAN_PROTOCOL]); @@ -114,11 +124,13 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, } break; default: + if (exists) + tcf_hash_release(a, bind); return -EINVAL; } action = parm->v_action; - if (!tcf_hash_check(tn, parm->index, a, bind)) { + if 
(!exists) { ret = tcf_hash_create(tn, parm->index, est, a, sizeof(*v), bind, false); if (ret) @@ -126,8 +138,6 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, ret = ACT_P_CREATED; } else { - if (bind) - return 0; tcf_hash_release(a, bind); if (!ovr) return -EEXIST; diff --git a/net/x25/x25_facilities.c b/net/x25/x25_facilities.c index 7ecd04c21360..997ff7b2509b 100644 --- a/net/x25/x25_facilities.c +++ b/net/x25/x25_facilities.c @@ -277,6 +277,7 @@ int x25_negotiate_facilities(struct sk_buff *skb, struct sock *sk, memset(&theirs, 0, sizeof(theirs)); memcpy(new, ours, sizeof(*new)); + memset(dte, 0, sizeof(*dte)); len = x25_parse_facilities(skb, &theirs, dte, &x25->vc_facil_mask); if (len < 0) diff --git a/sound/pci/hda/hda_sysfs.c b/sound/pci/hda/hda_sysfs.c index 64e0d1d81ca5..9739fce9e032 100644 --- a/sound/pci/hda/hda_sysfs.c +++ b/sound/pci/hda/hda_sysfs.c @@ -141,14 +141,6 @@ static int reconfig_codec(struct hda_codec *codec) err = snd_hda_codec_configure(codec); if (err < 0) goto error; - /* rebuild PCMs */ - err = snd_hda_codec_build_pcms(codec); - if (err < 0) - goto error; - /* rebuild mixers */ - err = snd_hda_codec_build_controls(codec); - if (err < 0) - goto error; err = snd_card_register(codec->card); error: snd_hda_power_down(codec); diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c index 1483f85999ec..a010d704e0e2 100644 --- a/sound/pci/hda/patch_hdmi.c +++ b/sound/pci/hda/patch_hdmi.c @@ -3401,6 +3401,9 @@ static int patch_atihdmi(struct hda_codec *codec) spec->ops.pin_hbr_setup = atihdmi_pin_hbr_setup; spec->ops.setup_stream = atihdmi_setup_stream; + spec->chmap.ops.pin_get_slot_channel = atihdmi_pin_get_slot_channel; + spec->chmap.ops.pin_set_slot_channel = atihdmi_pin_set_slot_channel; + if (!has_amd_full_remap_support(codec)) { /* override to ATI/AMD-specific versions with pairwise mapping */ spec->chmap.ops.chmap_cea_alloc_validate_get_type = @@ -3408,10 +3411,6 @@ static int patch_atihdmi(struct hda_codec 
*codec) spec->chmap.ops.cea_alloc_to_tlv_chmap = atihdmi_paired_cea_alloc_to_tlv_chmap; spec->chmap.ops.chmap_validate = atihdmi_paired_chmap_validate; - spec->chmap.ops.pin_get_slot_channel = - atihdmi_pin_get_slot_channel; - spec->chmap.ops.pin_set_slot_channel = - atihdmi_pin_set_slot_channel; } /* ATI/AMD converters do not advertise all of their capabilities */ diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index ac4490a96863..4918ffa5ba68 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -6426,6 +6426,7 @@ enum { ALC668_FIXUP_DELL_DISABLE_AAMIX, ALC668_FIXUP_DELL_XPS13, ALC662_FIXUP_ASUS_Nx50, + ALC668_FIXUP_ASUS_Nx51, }; static const struct hda_fixup alc662_fixups[] = { @@ -6672,6 +6673,15 @@ static const struct hda_fixup alc662_fixups[] = { .chained = true, .chain_id = ALC662_FIXUP_BASS_1A }, + [ALC668_FIXUP_ASUS_Nx51] = { + .type = HDA_FIXUP_PINS, + .v.pins = (const struct hda_pintbl[]) { + {0x1a, 0x90170151}, /* bass speaker */ + {} + }, + .chained = true, + .chain_id = ALC662_FIXUP_BASS_CHMAP, + }, }; static const struct snd_pci_quirk alc662_fixup_tbl[] = { @@ -6694,11 +6704,14 @@ static const struct snd_pci_quirk alc662_fixup_tbl[] = { SND_PCI_QUIRK(0x1028, 0x0698, "Dell", ALC668_FIXUP_DELL_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1028, 0x069f, "Dell", ALC668_FIXUP_DELL_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x103c, 0x1632, "HP RP5800", ALC662_FIXUP_HP_RP5800), + SND_PCI_QUIRK(0x1043, 0x1080, "Asus UX501VW", ALC668_FIXUP_HEADSET_MODE), SND_PCI_QUIRK(0x1043, 0x11cd, "Asus N550", ALC662_FIXUP_ASUS_Nx50), SND_PCI_QUIRK(0x1043, 0x13df, "Asus N550JX", ALC662_FIXUP_BASS_1A), SND_PCI_QUIRK(0x1043, 0x129d, "Asus N750", ALC662_FIXUP_ASUS_Nx50), SND_PCI_QUIRK(0x1043, 0x1477, "ASUS N56VZ", ALC662_FIXUP_BASS_MODE4_CHMAP), SND_PCI_QUIRK(0x1043, 0x15a7, "ASUS UX51VZH", ALC662_FIXUP_BASS_16), + SND_PCI_QUIRK(0x1043, 0x177d, "ASUS N551", ALC668_FIXUP_ASUS_Nx51), + SND_PCI_QUIRK(0x1043, 0x17bd, "ASUS N751", 
ALC668_FIXUP_ASUS_Nx51), SND_PCI_QUIRK(0x1043, 0x1b73, "ASUS N55SF", ALC662_FIXUP_BASS_16), SND_PCI_QUIRK(0x1043, 0x1bf3, "ASUS N76VZ", ALC662_FIXUP_BASS_MODE4_CHMAP), SND_PCI_QUIRK(0x1043, 0x8469, "ASUS mobo", ALC662_FIXUP_NO_JACK_DETECT), diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 0adfd9537cf7..6adde457b602 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -1137,8 +1137,11 @@ bool snd_usb_get_sample_rate_quirk(struct snd_usb_audio *chip) case USB_ID(0x047F, 0x0415): /* Plantronics BT-300 */ case USB_ID(0x047F, 0xAA05): /* Plantronics DA45 */ case USB_ID(0x04D8, 0xFEEA): /* Benchmark DAC1 Pre */ + case USB_ID(0x0556, 0x0014): /* Phoenix Audio TMX320VC */ case USB_ID(0x074D, 0x3553): /* Outlaw RR2150 (Micronas UAC3553B) */ + case USB_ID(0x1de7, 0x0013): /* Phoenix Audio MT202exe */ case USB_ID(0x1de7, 0x0014): /* Phoenix Audio TMX320 */ + case USB_ID(0x1de7, 0x0114): /* Phoenix Audio MT202pcs */ case USB_ID(0x21B4, 0x0081): /* AudioQuest DragonFly */ return true; } diff --git a/tools/lib/traceevent/parse-filter.c b/tools/lib/traceevent/parse-filter.c index 0144b3d1bb77..88cccea3ca99 100644 --- a/tools/lib/traceevent/parse-filter.c +++ b/tools/lib/traceevent/parse-filter.c @@ -1164,11 +1164,11 @@ process_filter(struct event_format *event, struct filter_arg **parg, current_op = current_exp; ret = collapse_tree(current_op, parg, error_str); + /* collapse_tree() may free current_op, and updates parg accordingly */ + current_op = NULL; if (ret < 0) goto fail; - *parg = current_op; - free(token); return 0; diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 5645a8361de6..e459b685a4e9 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -536,6 +536,7 @@ static int __run_perf_stat(int argc, const char **argv) perf_evlist__set_leader(evsel_list); evlist__for_each(evsel_list, counter) { +try_again: if (create_perf_stat_counter(counter) < 0) { /* * PPC returns ENXIO for HW counters until 2.6.37 @@ -552,7 
+553,11 @@ static int __run_perf_stat(int argc, const char **argv) if ((counter->leader != counter) || !(counter->leader->nr_members > 1)) continue; - } + } else if (perf_evsel__fallback(counter, errno, msg, sizeof(msg))) { + if (verbose) + ui__warning("%s\n", msg); + goto try_again; + } perf_evsel__open_strerror(counter, &target, errno, msg, sizeof(msg)); diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index a23f54793e51..964c7c3602c0 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -2274,6 +2274,8 @@ u64 perf_evsel__intval(struct perf_evsel *evsel, struct perf_sample *sample, bool perf_evsel__fallback(struct perf_evsel *evsel, int err, char *msg, size_t msgsize) { + int paranoid; + if ((err == ENOENT || err == ENXIO || err == ENODEV) && evsel->attr.type == PERF_TYPE_HARDWARE && evsel->attr.config == PERF_COUNT_HW_CPU_CYCLES) { @@ -2293,6 +2295,22 @@ bool perf_evsel__fallback(struct perf_evsel *evsel, int err, zfree(&evsel->name); return true; + } else if (err == EACCES && !evsel->attr.exclude_kernel && + (paranoid = perf_event_paranoid()) > 1) { + const char *name = perf_evsel__name(evsel); + char *new_name; + + if (asprintf(&new_name, "%s%su", name, strchr(name, ':') ? 
"" : ":") < 0) + return false; + + if (evsel->name) + free(evsel->name); + evsel->name = new_name; + scnprintf(msg, msgsize, +"kernel.perf_event_paranoid=%d, trying to fall back to excluding kernel samples", paranoid); + evsel->attr.exclude_kernel = 1; + + return true; } return false; @@ -2311,12 +2329,13 @@ int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target, "Consider tweaking /proc/sys/kernel/perf_event_paranoid,\n" "which controls use of the performance events system by\n" "unprivileged users (without CAP_SYS_ADMIN).\n\n" - "The default value is 1:\n\n" + "The current value is %d:\n\n" " -1: Allow use of (almost) all events by all users\n" ">= 0: Disallow raw tracepoint access by users without CAP_IOC_LOCK\n" ">= 1: Disallow CPU event access by users without CAP_SYS_ADMIN\n" ">= 2: Disallow kernel profiling by users without CAP_SYS_ADMIN", - target->system_wide ? "system-wide " : ""); + target->system_wide ? "system-wide " : "", + perf_event_paranoid()); case ENOENT: return scnprintf(msg, size, "The %s event is not supported.", perf_evsel__name(evsel)); diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index b04afc3295df..ff9e5f20a5a7 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -19,6 +19,7 @@ TARGETS += powerpc TARGETS += pstore TARGETS += ptrace TARGETS += seccomp +TARGETS += sigaltstack TARGETS += size TARGETS += static_keys TARGETS += sysctl diff --git a/tools/testing/selftests/rcutorture/bin/jitter.sh b/tools/testing/selftests/rcutorture/bin/jitter.sh new file mode 100755 index 000000000000..3633828375e3 --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/jitter.sh @@ -0,0 +1,90 @@ +#!/bin/bash +# +# Alternate sleeping and spinning on randomly selected CPUs. The purpose +# of this script is to inflict random OS jitter on a concurrently running +# test. 
+# +# Usage: jitter.sh me duration [ sleepmax [ spinmax ] ] +# +# me: Random-number-generator seed salt. +# duration: Time to run in seconds. +# sleepmax: Maximum microseconds to sleep, defaults to one second. +# spinmax: Maximum microseconds to spin, defaults to one millisecond. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, you can access it online at +# http://www.gnu.org/licenses/gpl-2.0.html. +# +# Copyright (C) IBM Corporation, 2016 +# +# Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + +me=$(($1 * 1000)) +duration=$2 +sleepmax=${3-1000000} +spinmax=${4-1000} + +n=1 + +starttime=`awk 'BEGIN { print systime(); }' < /dev/null` + +while : +do + # Check for done. + t=`awk -v s=$starttime 'BEGIN { print systime() - s; }' < /dev/null` + if test "$t" -gt "$duration" + then + exit 0; + fi + + # Set affinity to randomly selected CPU + cpus=`ls /sys/devices/system/cpu/*/online | + sed -e 's,/[^/]*$,,' -e 's/^[^0-9]*//' | + grep -v '^0*$'` + cpumask=`awk -v cpus="$cpus" -v me=$me -v n=$n 'BEGIN { + srand(n + me + systime()); + ncpus = split(cpus, ca); + curcpu = ca[int(rand() * ncpus + 1)]; + mask = lshift(1, curcpu); + if (mask + 0 <= 0) + mask = 1; + printf("%#x\n", mask); + }' < /dev/null` + n=$(($n+1)) + if ! 
taskset -p $cpumask $$ > /dev/null 2>&1 + then + echo taskset failure: '"taskset -p ' $cpumask $$ '"' + exit 1 + fi + + # Sleep a random duration + sleeptime=`awk -v me=$me -v n=$n -v sleepmax=$sleepmax 'BEGIN { + srand(n + me + systime()); + printf("%06d", int(rand() * sleepmax)); + }' < /dev/null` + n=$(($n+1)) + sleep .$sleeptime + + # Spin a random duration + limit=`awk -v me=$me -v n=$n -v spinmax=$spinmax 'BEGIN { + srand(n + me + systime()); + printf("%06d", int(rand() * spinmax)); + }' < /dev/null` + n=$(($n+1)) + for i in {1..$limit} + do + echo > /dev/null + done +done + +exit 1 diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf-ftrace.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf-ftrace.sh new file mode 100755 index 000000000000..f79b0e9e84fc --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf-ftrace.sh @@ -0,0 +1,121 @@ +#!/bin/bash +# +# Analyze a given results directory for rcuperf performance measurements, +# looking for ftrace data. Exits with 0 if data was found, analyzed, and +# printed. Intended to be invoked from kvm-recheck-rcuperf.sh after +# argument checking. +# +# Usage: kvm-recheck-rcuperf-ftrace.sh resdir +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, you can access it online at +# http://www.gnu.org/licenses/gpl-2.0.html. +# +# Copyright (C) IBM Corporation, 2016 +# +# Authors: Paul E. 
McKenney <paulmck@linux.vnet.ibm.com> + +i="$1" +. tools/testing/selftests/rcutorture/bin/functions.sh + +if test "`grep -c 'rcu_exp_grace_period.*start' < $i/console.log`" -lt 100 +then + exit 10 +fi + +sed -e 's/^\[[^]]*]//' < $i/console.log | +grep 'us : rcu_exp_grace_period' | +sed -e 's/us : / : /' | +tr -d '\015' | +awk ' +$8 == "start" { + if (starttask != "") + nlost++; + starttask = $1; + starttime = $3; + startseq = $7; +} + +$8 == "end" { + if (starttask == $1 && startseq == $7) { + curgpdur = $3 - starttime; + gptimes[++n] = curgpdur; + gptaskcnt[starttask]++; + sum += curgpdur; + if (curgpdur > 1000) + print "Long GP " starttime "us to " $3 "us (" curgpdur "us)"; + starttask = ""; + } else { + # Lost a message or some such, reset. + starttask = ""; + nlost++; + } +} + +$8 == "done" { + piggybackcnt[$1]++; +} + +END { + newNR = asort(gptimes); + if (newNR <= 0) { + print "No ftrace records found???" + exit 10; + } + pct50 = int(newNR * 50 / 100); + if (pct50 < 1) + pct50 = 1; + pct90 = int(newNR * 90 / 100); + if (pct90 < 1) + pct90 = 1; + pct99 = int(newNR * 99 / 100); + if (pct99 < 1) + pct99 = 1; + div = 10 ** int(log(gptimes[pct90]) / log(10) + .5) / 100; + print "Histogram bucket size: " div; + last = gptimes[1] - 10; + count = 0; + for (i = 1; i <= newNR; i++) { + current = div * int(gptimes[i] / div); + if (last == current) { + count++; + } else { + if (count > 0) + print last, count; + count = 1; + last = current; + } + } + if (count > 0) + print last, count; + print "Distribution of grace periods across tasks:"; + for (i in gptaskcnt) { + print "\t" i, gptaskcnt[i]; + nbatches += gptaskcnt[i]; + } + ngps = nbatches; + print "Distribution of piggybacking across tasks:"; + for (i in piggybackcnt) { + print "\t" i, piggybackcnt[i]; + ngps += piggybackcnt[i]; + } + print "Average grace-period duration: " sum / newNR " microseconds"; + print "Minimum grace-period duration: " gptimes[1]; + print "50th percentile grace-period duration: " 
gptimes[pct50]; + print "90th percentile grace-period duration: " gptimes[pct90]; + print "99th percentile grace-period duration: " gptimes[pct99]; + print "Maximum grace-period duration: " gptimes[newNR]; + print "Grace periods: " ngps + 0 " Batches: " nbatches + 0 " Ratio: " ngps / nbatches " Lost: " nlost + 0; + print "Computed from ftrace data."; +}' +exit 0 diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh new file mode 100755 index 000000000000..8f3121afc716 --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# +# Analyze a given results directory for rcuperf performance measurements. +# +# Usage: kvm-recheck-rcuperf.sh resdir +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, you can access it online at +# http://www.gnu.org/licenses/gpl-2.0.html. +# +# Copyright (C) IBM Corporation, 2016 +# +# Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + +i="$1" +if test -d $i +then + : +else + echo Unreadable results directory: $i + exit 1 +fi +PATH=`pwd`/tools/testing/selftests/rcutorture/bin:$PATH; export PATH +. tools/testing/selftests/rcutorture/bin/functions.sh + +if kvm-recheck-rcuperf-ftrace.sh $i +then + # ftrace data was successfully analyzed, call it good! 
+ exit 0 +fi + +configfile=`echo $i | sed -e 's/^.*\///'` + +sed -e 's/^\[[^]]*]//' < $i/console.log | +awk ' +/-perf: .* gps: .* batches:/ { + ngps = $9; + nbatches = $11; +} + +/-perf: .*writer-duration/ { + gptimes[++n] = $5 / 1000.; + sum += $5 / 1000.; +} + +END { + newNR = asort(gptimes); + if (newNR <= 0) { + print "No rcuperf records found???" + exit; + } + pct50 = int(newNR * 50 / 100); + if (pct50 < 1) + pct50 = 1; + pct90 = int(newNR * 90 / 100); + if (pct90 < 1) + pct90 = 1; + pct99 = int(newNR * 99 / 100); + if (pct99 < 1) + pct99 = 1; + div = 10 ** int(log(gptimes[pct90]) / log(10) + .5) / 100; + print "Histogram bucket size: " div; + last = gptimes[1] - 10; + count = 0; + for (i = 1; i <= newNR; i++) { + current = div * int(gptimes[i] / div); + if (last == current) { + count++; + } else { + if (count > 0) + print last, count; + count = 1; + last = current; + } + } + if (count > 0) + print last, count; + print "Average grace-period duration: " sum / newNR " microseconds"; + print "Minimum grace-period duration: " gptimes[1]; + print "50th percentile grace-period duration: " gptimes[pct50]; + print "90th percentile grace-period duration: " gptimes[pct90]; + print "99th percentile grace-period duration: " gptimes[pct99]; + print "Maximum grace-period duration: " gptimes[newNR]; + print "Grace periods: " ngps + 0 " Batches: " nbatches + 0 " Ratio: " ngps / nbatches; + print "Computed from rcuperf printk output."; +}' diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh index d86bdd6b6cc2..f659346d3358 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh @@ -48,7 +48,10 @@ do cat $i/Make.oldconfig.err fi parse-build.sh $i/Make.out $configfile - parse-torture.sh $i/console.log $configfile + if test "$TORTURE_SUITE" != rcuperf + then + parse-torture.sh $i/console.log $configfile + fi parse-console.sh $i/console.log 
$configfile if test -r $i/Warnings then diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 0f80eefb0bfd..4109f306d855 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -6,7 +6,7 @@ # Execute this in the source tree. Do not run it as a background task # because qemu does not seem to like that much. # -# Usage: kvm-test-1-run.sh config builddir resdir minutes qemu-args boot_args +# Usage: kvm-test-1-run.sh config builddir resdir seconds qemu-args boot_args # # qemu-args defaults to "-enable-kvm -soundhw pcspk -nographic", along with # arguments specifying the number of CPUs and other @@ -91,25 +91,33 @@ fi # CONFIG_PCMCIA=n # CONFIG_CARDBUS=n # CONFIG_YENTA=n -if kvm-build.sh $config_template $builddir $T +base_resdir=`echo $resdir | sed -e 's/\.[0-9]\+$//'` +if test "$base_resdir" != "$resdir" -a -f $base_resdir/bzImage -a -f $base_resdir/vmlinux then + # Rerunning previous test, so use that test's kernel. + QEMU="`identify_qemu $base_resdir/vmlinux`" + KERNEL=$base_resdir/bzImage + ln -s $base_resdir/Make*.out $resdir # for kvm-recheck.sh + ln -s $base_resdir/.config $resdir # for kvm-recheck.sh +elif kvm-build.sh $config_template $builddir $T +then + # Had to build a kernel for this test. QEMU="`identify_qemu $builddir/vmlinux`" BOOT_IMAGE="`identify_boot_image $QEMU`" cp $builddir/Make*.out $resdir + cp $builddir/vmlinux $resdir cp $builddir/.config $resdir if test -n "$BOOT_IMAGE" then cp $builddir/$BOOT_IMAGE $resdir + KERNEL=$resdir/bzImage else echo No identifiable boot image, not running KVM, see $resdir. echo Do the torture scripts know about your architecture? fi parse-build.sh $resdir/Make.out $title - if test -f $builddir.wait - then - mv $builddir.wait $builddir.ready - fi else + # Build failed. 
cp $builddir/Make*.out $resdir cp $builddir/.config $resdir || : echo Build failed, not running KVM, see $resdir. @@ -119,12 +127,15 @@ else fi exit 1 fi +if test -f $builddir.wait +then + mv $builddir.wait $builddir.ready +fi while test -f $builddir.ready do sleep 1 done -minutes=$4 -seconds=$(($minutes * 60)) +seconds=$4 qemu_args=$5 boot_args=$6 @@ -167,15 +178,26 @@ then exit 0 fi echo "NOTE: $QEMU either did not run or was interactive" > $resdir/console.log -echo $QEMU $qemu_args -m 512 -kernel $resdir/bzImage -append \"$qemu_append $boot_args\" > $resdir/qemu-cmd -( $QEMU $qemu_args -m 512 -kernel $resdir/bzImage -append "$qemu_append $boot_args"; echo $? > $resdir/qemu-retval ) & -qemu_pid=$! +echo $QEMU $qemu_args -m 512 -kernel $KERNEL -append \"$qemu_append $boot_args\" > $resdir/qemu-cmd +( $QEMU $qemu_args -m 512 -kernel $KERNEL -append "$qemu_append $boot_args"& echo $! > $resdir/qemu_pid; wait `cat $resdir/qemu_pid`; echo $? > $resdir/qemu-retval ) & commandcompleted=0 -echo Monitoring qemu job at pid $qemu_pid +sleep 10 # Give qemu's pid a chance to reach the file +if test -s "$resdir/qemu_pid" +then + qemu_pid=`cat "$resdir/qemu_pid"` + echo Monitoring qemu job at pid $qemu_pid +else + qemu_pid="" + echo Monitoring qemu job at yet-as-unknown pid +fi while : do + if test -z "$qemu_pid" -a -s "$resdir/qemu_pid" + then + qemu_pid=`cat "$resdir/qemu_pid"` + fi kruntime=`awk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null` - if kill -0 $qemu_pid > /dev/null 2>&1 + if test -z "$qemu_pid" || kill -0 "$qemu_pid" > /dev/null 2>&1 then if test $kruntime -ge $seconds then @@ -195,12 +217,16 @@ do ps -fp $killpid >> $resdir/Warnings 2>&1 fi else - echo ' ---' `date`: Kernel done + echo ' ---' `date`: "Kernel done" fi break fi done -if test $commandcompleted -eq 0 +if test -z "$qemu_pid" -a -s "$resdir/qemu_pid" +then + qemu_pid=`cat "$resdir/qemu_pid"` +fi +if test $commandcompleted -eq 0 -a -n "$qemu_pid" then echo Grace period for qemu job at pid 
$qemu_pid while : @@ -220,6 +246,9 @@ then fi sleep 1 done +elif test -z "$qemu_pid" +then + echo Unknown PID, cannot kill qemu command fi parse-torture.sh $resdir/console.log $title diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 4a431767f77a..0d598145873e 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -34,7 +34,7 @@ T=/tmp/kvm.sh.$$ trap 'rm -rf $T' 0 mkdir $T -dur=30 +dur=$((30*60)) dryrun="" KVM="`pwd`/tools/testing/selftests/rcutorture"; export KVM PATH=${KVM}/bin:$PATH; export PATH @@ -48,6 +48,7 @@ resdir="" configs="" cpus=0 ds=`date +%Y.%m.%d-%H:%M:%S` +jitter=0 . functions.sh @@ -63,6 +64,7 @@ usage () { echo " --dryrun sched|script" echo " --duration minutes" echo " --interactive" + echo " --jitter N [ maxsleep (us) [ maxspin (us) ] ]" echo " --kmake-arg kernel-make-arguments" echo " --mac nn:nn:nn:nn:nn:nn" echo " --no-initrd" @@ -116,12 +118,17 @@ do ;; --duration) checkarg --duration "(minutes)" $# "$2" '^[0-9]*$' '^error' - dur=$2 + dur=$(($2*60)) shift ;; --interactive) TORTURE_QEMU_INTERACTIVE=1; export TORTURE_QEMU_INTERACTIVE ;; + --jitter) + checkarg --jitter "(# threads [ sleep [ spin ] ])" $# "$2" '^-\{,1\}[0-9]\+\( \+[0-9]\+\)\{,2\} *$' '^error$' + jitter="$2" + shift + ;; --kmake-arg) checkarg --kmake-arg "(kernel make arguments)" $# "$2" '.*' '^error$' TORTURE_KMAKE_ARG="$2" @@ -156,7 +163,7 @@ do shift ;; --torture) - checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\)$' '^--' + checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\|rcuperf\)$' '^--' TORTURE_SUITE=$2 shift ;; @@ -299,6 +306,7 @@ awk < $T/cfgcpu.pack \ -v CONFIGDIR="$CONFIGFRAG/" \ -v KVM="$KVM" \ -v ncpus=$cpus \ + -v jitter="$jitter" \ -v rd=$resdir/$ds/ \ -v dur=$dur \ -v TORTURE_QEMU_ARG="$TORTURE_QEMU_ARG" \ @@ -359,6 +367,16 @@ function dump(first, pastlast, batchnum) print "\techo ----", cfr[j], cpusr[j] ovf ": Starting 
kernel. `date` >> " rd "/log"; print "fi" } + njitter = 0; + split(jitter, ja); + if (ja[1] == -1 && ncpus == 0) + njitter = 1; + else if (ja[1] == -1) + njitter = ncpus; + else + njitter = ja[1]; + for (j = 0; j < njitter; j++) + print "jitter.sh " j " " dur " " ja[2] " " ja[3] "&" print "wait" print "if test -z \"$TORTURE_BUILDONLY\"" print "then" diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 b/tools/testing/selftests/rcutorture/configs/rcu/TREE04 index 39a2c6d7d7ec..17cbe098b115 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04 @@ -14,7 +14,7 @@ CONFIG_HOTPLUG_CPU=n CONFIG_SUSPEND=n CONFIG_HIBERNATION=n CONFIG_RCU_FANOUT=4 -CONFIG_RCU_FANOUT_LEAF=4 +CONFIG_RCU_FANOUT_LEAF=3 CONFIG_RCU_NOCB_CPU=n CONFIG_DEBUG_LOCK_ALLOC=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot index 0fc8a3428938..e34c33430447 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot @@ -1 +1 @@ -rcutorture.torture_type=rcu_bh +rcutorture.torture_type=rcu_bh rcutree.rcu_fanout_leaf=4 diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/CFLIST b/tools/testing/selftests/rcutorture/configs/rcuperf/CFLIST new file mode 100644 index 000000000000..c9f56cf20775 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcuperf/CFLIST @@ -0,0 +1 @@ +TREE diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/CFcommon b/tools/testing/selftests/rcutorture/configs/rcuperf/CFcommon new file mode 100644 index 000000000000..a09816b8c0f3 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcuperf/CFcommon @@ -0,0 +1,2 @@ +CONFIG_RCU_PERF_TEST=y +CONFIG_PRINTK_TIME=y diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/TREE 
b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE new file mode 100644 index 000000000000..a312f671a29a --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE @@ -0,0 +1,20 @@ +CONFIG_SMP=y +CONFIG_PREEMPT_NONE=n +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=y +#CHECK#CONFIG_PREEMPT_RCU=y +CONFIG_HZ_PERIODIC=n +CONFIG_NO_HZ_IDLE=y +CONFIG_NO_HZ_FULL=n +CONFIG_RCU_FAST_NO_HZ=n +CONFIG_RCU_TRACE=n +CONFIG_HOTPLUG_CPU=n +CONFIG_SUSPEND=n +CONFIG_HIBERNATION=n +CONFIG_RCU_NOCB_CPU=n +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_PROVE_LOCKING=n +CONFIG_RCU_BOOST=n +CONFIG_DEBUG_OBJECTS_RCU_HEAD=n +CONFIG_RCU_EXPERT=y +CONFIG_RCU_TRACE=y diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/TREE54 b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE54 new file mode 100644 index 000000000000..985fb170d13c --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE54 @@ -0,0 +1,23 @@ +CONFIG_SMP=y +CONFIG_NR_CPUS=54 +CONFIG_PREEMPT_NONE=n +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=y +#CHECK#CONFIG_PREEMPT_RCU=y +CONFIG_HZ_PERIODIC=n +CONFIG_NO_HZ_IDLE=y +CONFIG_NO_HZ_FULL=n +CONFIG_RCU_FAST_NO_HZ=n +CONFIG_RCU_TRACE=n +CONFIG_HOTPLUG_CPU=n +CONFIG_SUSPEND=n +CONFIG_HIBERNATION=n +CONFIG_RCU_FANOUT=3 +CONFIG_RCU_FANOUT_LEAF=2 +CONFIG_RCU_NOCB_CPU=n +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_PROVE_LOCKING=n +CONFIG_RCU_BOOST=n +CONFIG_DEBUG_OBJECTS_RCU_HEAD=n +CONFIG_RCU_EXPERT=y +CONFIG_RCU_TRACE=y diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh new file mode 100644 index 000000000000..34f2a1b35ee5 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh @@ -0,0 +1,52 @@ +#!/bin/bash +# +# Torture-suite-dependent shell functions for the rest of the scripts. 
+# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, you can access it online at +# http://www.gnu.org/licenses/gpl-2.0.html. +# +# Copyright (C) IBM Corporation, 2015 +# +# Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + +# rcuperf_param_nreaders bootparam-string +# +# Adds nreaders rcuperf module parameter if not already specified. +rcuperf_param_nreaders () { + if ! echo "$1" | grep -q "rcuperf.nreaders" + then + echo rcuperf.nreaders=-1 + fi +} + +# rcuperf_param_nwriters bootparam-string +# +# Adds nwriters rcuperf module parameter if not already specified. +rcuperf_param_nwriters () { + if ! echo "$1" | grep -q "rcuperf.nwriters" + then + echo rcuperf.nwriters=-1 + fi +} + +# per_version_boot_params bootparam-string config-file seconds +# +# Adds per-version torture-module parameters to kernels supporting them. 
+per_version_boot_params () { + echo $1 `rcuperf_param_nreaders "$1"` \ + `rcuperf_param_nwriters "$1"` \ + rcuperf.perf_runnable=1 \ + rcuperf.shutdown=1 \ + rcuperf.verbose=1 +} diff --git a/tools/testing/selftests/sigaltstack/Makefile b/tools/testing/selftests/sigaltstack/Makefile new file mode 100644 index 000000000000..56af56eda6fa --- /dev/null +++ b/tools/testing/selftests/sigaltstack/Makefile @@ -0,0 +1,8 @@ +CFLAGS = -Wall +BINARIES = sas +all: $(BINARIES) + +include ../lib.mk + +clean: + rm -rf $(BINARIES) diff --git a/tools/testing/selftests/sigaltstack/sas.c b/tools/testing/selftests/sigaltstack/sas.c new file mode 100644 index 000000000000..1bb01258e559 --- /dev/null +++ b/tools/testing/selftests/sigaltstack/sas.c @@ -0,0 +1,176 @@ +/* + * Stas Sergeev <stsp@users.sourceforge.net> + * + * test sigaltstack(SS_ONSTACK | SS_AUTODISARM) + * If that succeeds, then swapcontext() can be used inside sighandler safely. + * + */ + +#define _GNU_SOURCE +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/mman.h> +#include <ucontext.h> +#include <alloca.h> +#include <string.h> +#include <assert.h> +#include <errno.h> + +#ifndef SS_AUTODISARM +#define SS_AUTODISARM (1U << 31) +#endif + +static void *sstack, *ustack; +static ucontext_t uc, sc; +static const char *msg = "[OK]\tStack preserved"; +static const char *msg2 = "[FAIL]\tStack corrupted"; +struct stk_data { + char msg[128]; + int flag; +}; + +void my_usr1(int sig, siginfo_t *si, void *u) +{ + char *aa; + int err; + stack_t stk; + struct stk_data *p; + + register unsigned long sp asm("sp"); + + if (sp < (unsigned long)sstack || + sp >= (unsigned long)sstack + SIGSTKSZ) { + printf("[FAIL]\tSP is not on sigaltstack\n"); + exit(EXIT_FAILURE); + } + /* put some data on stack. 
other sighandler will try to overwrite it */ + aa = alloca(1024); + assert(aa); + p = (struct stk_data *)(aa + 512); + strcpy(p->msg, msg); + p->flag = 1; + printf("[RUN]\tsignal USR1\n"); + err = sigaltstack(NULL, &stk); + if (err) { + perror("[FAIL]\tsigaltstack()"); + exit(EXIT_FAILURE); + } + if (stk.ss_flags != SS_DISABLE) + printf("[FAIL]\tss_flags=%i, should be SS_DISABLE\n", + stk.ss_flags); + else + printf("[OK]\tsigaltstack is disabled in sighandler\n"); + swapcontext(&sc, &uc); + printf("%s\n", p->msg); + if (!p->flag) { + printf("[RUN]\tAborting\n"); + exit(EXIT_FAILURE); + } +} + +void my_usr2(int sig, siginfo_t *si, void *u) +{ + char *aa; + struct stk_data *p; + + printf("[RUN]\tsignal USR2\n"); + aa = alloca(1024); + /* dont run valgrind on this */ + /* try to find the data stored by previous sighandler */ + p = memmem(aa, 1024, msg, strlen(msg)); + if (p) { + printf("[FAIL]\tsigaltstack re-used\n"); + /* corrupt the data */ + strcpy(p->msg, msg2); + /* tell other sighandler that his data is corrupted */ + p->flag = 0; + } +} + +static void switch_fn(void) +{ + printf("[RUN]\tswitched to user ctx\n"); + raise(SIGUSR2); + setcontext(&sc); +} + +int main(void) +{ + struct sigaction act; + stack_t stk; + int err; + + sigemptyset(&act.sa_mask); + act.sa_flags = SA_ONSTACK | SA_SIGINFO; + act.sa_sigaction = my_usr1; + sigaction(SIGUSR1, &act, NULL); + act.sa_sigaction = my_usr2; + sigaction(SIGUSR2, &act, NULL); + sstack = mmap(NULL, SIGSTKSZ, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0); + if (sstack == MAP_FAILED) { + perror("mmap()"); + return EXIT_FAILURE; + } + + err = sigaltstack(NULL, &stk); + if (err) { + perror("[FAIL]\tsigaltstack()"); + exit(EXIT_FAILURE); + } + if (stk.ss_flags == SS_DISABLE) { + printf("[OK]\tInitial sigaltstack state was SS_DISABLE\n"); + } else { + printf("[FAIL]\tInitial sigaltstack state was %i; should have been SS_DISABLE\n", stk.ss_flags); + return EXIT_FAILURE; + } + + stk.ss_sp = sstack; 
+ stk.ss_size = SIGSTKSZ; + stk.ss_flags = SS_ONSTACK | SS_AUTODISARM; + err = sigaltstack(&stk, NULL); + if (err) { + if (errno == EINVAL) { + printf("[NOTE]\tThe running kernel doesn't support SS_AUTODISARM\n"); + /* + * If test cases for the !SS_AUTODISARM variant were + * added, we could still run them. We don't have any + * test cases like that yet, so just exit and report + * success. + */ + return 0; + } else { + perror("[FAIL]\tsigaltstack(SS_ONSTACK | SS_AUTODISARM)"); + return EXIT_FAILURE; + } + } + + ustack = mmap(NULL, SIGSTKSZ, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0); + if (ustack == MAP_FAILED) { + perror("mmap()"); + return EXIT_FAILURE; + } + getcontext(&uc); + uc.uc_link = NULL; + uc.uc_stack.ss_sp = ustack; + uc.uc_stack.ss_size = SIGSTKSZ; + makecontext(&uc, switch_fn, 0); + raise(SIGUSR1); + + err = sigaltstack(NULL, &stk); + if (err) { + perror("[FAIL]\tsigaltstack()"); + exit(EXIT_FAILURE); + } + if (stk.ss_flags != SS_AUTODISARM) { + printf("[FAIL]\tss_flags=%i, should be SS_AUTODISARM\n", + stk.ss_flags); + exit(EXIT_FAILURE); + } + printf("[OK]\tsigaltstack is still SS_AUTODISARM after signal\n"); + + printf("[OK]\tTest passed\n"); + return 0; +} |