From bbf8c67aa6ae8bd588f097510d887dad071f9f43 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Thu, 8 Jan 2026 19:38:36 +0800 Subject: tools: jobserver: Prevent deadlock caused by incorrect jobserver configuration and enhance error reporting When using GNU Make's jobserver feature in kernel builds, a bug in MAKEFLAGS propagation caused "--jobserver-auth=r,w" to reference an unintended file descriptor. This led to infinite loops in jobserver-exec's os.read() calls due to empty token. My shell opened /etc/passwd for some reason without closing it, and as a result, all child processes inherited this fd 3. $ ls -l /proc/self/fd total 0 lrwx------ 1 changbin changbin 64 Dec 25 13:03 0 -> /dev/pts/1 lrwx------ 1 changbin changbin 64 Dec 25 13:03 1 -> /dev/pts/1 lrwx------ 1 changbin changbin 64 Dec 25 13:03 2 -> /dev/pts/1 lr-x------ 1 changbin changbin 64 Dec 25 13:03 3 -> /etc/passwd lr-x------ 1 changbin changbin 64 Dec 25 13:03 4 -> /proc/1421383/fd In this case, the `make` should open a new file descriptor for jobserver control, but clearly, it did not do so and instead still passed fd 3 as "--jobserver-auth=3,4" in MAKEFLAGS. (The version of my gnu make is 4.3) This update ensures robustness against invalid jobserver configurations, even when `make` incorrectly pass non-pipe file descriptors. * Rejecting empty reads to prevent infinite loops on EOF. * Clearing `self.jobs` to avoid writing to incorrect files if invalid tokens are detected. * Printing detailed error messages to stderr to inform the user. Cc: Mauro Carvalho Chehab Reviewed-by: Mauro Carvalho Chehab Signed-off-by: Changbin Du Signed-off-by: Jonathan Corbet Message-ID: <20260108113836.2976527-1-changbin.du@huawei.com> --- tools/lib/python/jobserver.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'tools/lib/python/jobserver.py') diff --git a/tools/lib/python/jobserver.py b/tools/lib/python/jobserver.py index a24f30ef4fa8..616411087725 100755 --- a/tools/lib/python/jobserver.py +++ b/tools/lib/python/jobserver.py @@ -91,6 +91,10 @@ class JobserverExec: while True: try: slot = os.read(self.reader, 8) + if not slot: + # Clear self.jobs to prevent us from probably writing incorrect file. + self.jobs = b"" + raise ValueError("unexpected empty token from jobserver fd, invalid '--jobserver-auth=' setting?") self.jobs += slot except (OSError, IOError) as e: if e.errno == errno.EWOULDBLOCK: @@ -105,7 +109,8 @@ class JobserverExec: # to sit here blocked on our child. self.claim = len(self.jobs) + 1 - except (KeyError, IndexError, ValueError, OSError, IOError): + except (KeyError, IndexError, ValueError, OSError, IOError) as e: + print(f"jobserver: warning: {repr(e)}", file=sys.stderr) # Any missing environment strings or bad fds should result in just # not being parallel. self.claim = None -- cgit v1.2.3 From b2664a90c171846fcd572d93f6f21459721a1d2e Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Mon, 12 Jan 2026 09:19:49 -0700 Subject: jobserver: Split up the big try: block The parsing of jobserver options is done in a massive try: block that hides problems and (perhaps) bugs. Split up that block and make the logic explicit by moving the initial parsing of MAKEFLAGS out of that block. Add warnings in the places things can go wrong. Reviewed-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet --- tools/lib/python/jobserver.py | 143 ++++++++++++++++++++++++++---------------- 1 file changed, 90 insertions(+), 53 deletions(-) (limited to 'tools/lib/python/jobserver.py') diff --git a/tools/lib/python/jobserver.py b/tools/lib/python/jobserver.py index 616411087725..a7c70ff4c375 100755 --- a/tools/lib/python/jobserver.py +++ b/tools/lib/python/jobserver.py @@ -35,6 +35,9 @@ import os import subprocess import sys +def warn(text, *args): + print(f'WARNING: {text}', *args, file = sys.stderr) + class JobserverExec: """ Claim all slots from make using POSIX Jobserver. @@ -58,64 +61,98 @@ class JobserverExec: if self.is_open: return - - try: - # Fetch the make environment options. - flags = os.environ["MAKEFLAGS"] - # Look for "--jobserver=R,W" - # Note that GNU Make has used --jobserver-fds and --jobserver-auth - # so this handles all of them. - opts = [x for x in flags.split(" ") if x.startswith("--jobserver")] - - # Parse out R,W file descriptor numbers and set them nonblocking. - # If the MAKEFLAGS variable contains multiple instances of the - # --jobserver-auth= option, the last one is relevant. - fds = opts[-1].split("=", 1)[1] - - # Starting with GNU Make 4.4, named pipes are used for reader - # and writer. - # Example argument: --jobserver-auth=fifo:/tmp/GMfifo8134 - _, _, path = fds.partition("fifo:") - - if path: + self.is_open = True # We only try once + self.claim = None + # + # Check the make flags for "--jobserver=R,W" + # Note that GNU Make has used --jobserver-fds and --jobserver-auth + # so this handles all of them. + # + flags = os.environ.get('MAKEFLAGS', '') + opts = [x for x in flags.split(" ") if x.startswith("--jobserver")] + if not opts: + return + # + # Separate out the provided file descriptors + # + split_opt = opts[-1].split('=', 1) + if len(split_opt) != 2: + warn('unparseable option:', opts[-1]) + return + fds = split_opt[1] + # + # As of GNU Make 4.4, we'll be looking for a named pipe + # identified as fifo:path + # + if fds.startswith('fifo:'): + path = fds[len('fifo:'):] + try: self.reader = os.open(path, os.O_RDONLY | os.O_NONBLOCK) self.writer = os.open(path, os.O_WRONLY) - else: - self.reader, self.writer = [int(x) for x in fds.split(",", 1)] + except (OSError, IOError): + warn('unable to open jobserver pipe', path) + return + # + # Otherwise look for integer file-descriptor numbers. + # + else: + split_fds = fds.split(',') + if len(split_fds) != 2: + warn('malformed jobserver file descriptors:', fds) + return + try: + self.reader = int(split_fds[0]) + self.writer = int(split_fds[1]) + except ValueError: + warn('non-integer jobserver file-descriptors:', fds) + return + try: + # # Open a private copy of reader to avoid setting nonblocking # on an unexpecting process with the same reader fd. - self.reader = os.open("/proc/self/fd/%d" % (self.reader), + # + self.reader = os.open(f"/proc/self/fd/{self.reader}", os.O_RDONLY | os.O_NONBLOCK) - - # Read out as many jobserver slots as possible - while True: - try: - slot = os.read(self.reader, 8) - if not slot: - # Clear self.jobs to prevent us from probably writing incorrect file. - self.jobs = b"" - raise ValueError("unexpected empty token from jobserver fd, invalid '--jobserver-auth=' setting?") - self.jobs += slot - except (OSError, IOError) as e: - if e.errno == errno.EWOULDBLOCK: - # Stop at the end of the jobserver queue. - break - # If something went wrong, give back the jobs. - if self.jobs: - os.write(self.writer, self.jobs) - raise e - - # Add a bump for our caller's reserveration, since we're just going - # to sit here blocked on our child. - self.claim = len(self.jobs) + 1 - - except (KeyError, IndexError, ValueError, OSError, IOError) as e: - print(f"jobserver: warning: {repr(e)}", file=sys.stderr) - # Any missing environment strings or bad fds should result in just - # not being parallel. - self.claim = None - - self.is_open = True + except (IOError, OSError) as e: + warn('Unable to reopen jobserver read-side pipe:', repr(e)) + return + # + # OK, we have the channel to the job server; read out as many jobserver + # slots as possible. + # + while True: + try: + slot = os.read(self.reader, 8) + if not slot: + # + # Something went wrong. Clear self.jobs to avoid writing + # weirdness back to the jobserver and give up. + self.jobs = b"" + warn("unexpected empty token from jobserver;" + " possible invalid '--jobserver-auth=' setting") + self.claim = None + return + except (OSError, IOError) as e: + # + # If there is nothing more to read then we are done. + # + if e.errno == errno.EWOULDBLOCK: + break + # + # Anything else says that something went weird; give back + # the jobs and give up. + # + if self.jobs: + os.write(self.writer, self.jobs) + self.claim = None + warn('error reading from jobserver pipe', repr(e)) + return + self.jobs += slot + # + # Add a bump for our caller's reserveration, since we're just going + # to sit here blocked on our child. + # + self.claim = len(self.jobs) + 1 def close(self): """Return all reserved slots to Jobserver""" -- cgit v1.2.3 From 8b85f614f3b68a8a58762c8f8defbcebf6f0282a Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 19 Jan 2026 17:23:26 +0100 Subject: docs: jobserver: do some documentation improvements Make Sphinx handle better jobserver class documentation Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet Message-ID: <18a9c1406bdead680e3ee5768c97ae8b2138e8ea.1768838938.git.mchehab+huawei@kernel.org> --- tools/lib/python/jobserver.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'tools/lib/python/jobserver.py') diff --git a/tools/lib/python/jobserver.py b/tools/lib/python/jobserver.py index 616411087725..8da1973e5c87 100755 --- a/tools/lib/python/jobserver.py +++ b/tools/lib/python/jobserver.py @@ -11,20 +11,23 @@ Interacts with the POSIX jobserver during the Kernel build time. A "normal" jobserver task, like the one initiated by a make subrocess would do: - open read/write file descriptors to communicate with the job server; - - ask for one slot by calling: + - ask for one slot by calling:: + claim = os.read(reader, 1) - - when the job finshes, call: + + - when the job finshes, call:: + os.write(writer, b"+") # os.write(writer, claim) Here, the goal is different: This script aims to get the remaining number of slots available, using all of them to run a command which handle tasks in parallel. To to that, it has a loop that ends only after there are no slots left. It then increments the number by one, in order to allow a -call equivalent to make -j$((claim+1)), e.g. having a parent make creating +call equivalent to ``make -j$((claim+1))``, e.g. having a parent make creating $claim child to do the actual work. The end goal here is to keep the total number of build tasks under the -limit established by the initial make -j$n_proc call. +limit established by the initial ``make -j$n_proc`` call. See: https://www.gnu.org/software/make/manual/html_node/POSIX-Jobserver.html#POSIX-Jobserver @@ -40,13 +43,14 @@ class JobserverExec: Claim all slots from make using POSIX Jobserver. The main methods here are: + - open(): reserves all slots; - close(): method returns all used slots back to make; - - run(): executes a command setting PARALLELISM= + - run(): executes a command setting PARALLELISM=. """ def __init__(self): - """Initialize internal vars""" + """Initialize internal vars.""" self.claim = 0 self.jobs = b"" self.reader = None @@ -54,7 +58,7 @@ class JobserverExec: self.is_open = False def open(self): - """Reserve all available slots to be claimed later on""" + """Reserve all available slots to be claimed later on.""" if self.is_open: return @@ -118,7 +122,7 @@ class JobserverExec: self.is_open = True def close(self): - """Return all reserved slots to Jobserver""" + """Return all reserved slots to Jobserver.""" if not self.is_open: return -- cgit v1.2.3