diff options
Diffstat (limited to 'poky/bitbake/lib/bb/pysh/pyshlex.py')
-rw-r--r-- | poky/bitbake/lib/bb/pysh/pyshlex.py | 888 |
1 files changed, 888 insertions, 0 deletions
# File: poky/bitbake/lib/bb/pysh/pyshlex.py
#
# pyshlex.py - PLY compatible lexer for pysh.
#
# Copyright 2007 Patrick Mezard
#
# This software may be used and distributed according to the terms
# of the GNU General Public License, incorporated herein by reference.

# TODO:
# - review all "char in 'abc'" snippets: the empty string can be matched
# - test line continuations within quoted/expansion strings
# - eof is buggy wrt sublexers
# - the lexer cannot really work in pull mode as it would be required to run
# PLY in pull mode. It was designed to work incrementally and it would not be
# that hard to enable pull mode.
import re
try:
    s = set()
    del s
except NameError:
    from Set import Set as set

from ply import lex
from bb.pysh.sherrors import *


class NeedMore(Exception):
    """Raised by the lexers when more input is required to delimit a token."""
    pass


def is_blank(c):
    """Return True if c is a space or a tab."""
    return c in (' ', '\t')


_RE_DIGITS = re.compile(r'^\d+$')


def are_digits(s):
    """Return True if s is a non-empty sequence of digits."""
    return _RE_DIGITS.search(s) is not None


# Operator text -> token type.
_OPERATORS = dict([
    ('&&', 'AND_IF'),
    ('||', 'OR_IF'),
    (';;', 'DSEMI'),
    ('<<', 'DLESS'),
    ('>>', 'DGREAT'),
    ('<&', 'LESSAND'),
    ('>&', 'GREATAND'),
    ('<>', 'LESSGREAT'),
    ('<<-', 'DLESSDASH'),
    ('>|', 'CLOBBER'),
    ('&', 'AMP'),
    (';', 'COMMA'),
    ('<', 'LESS'),
    ('>', 'GREATER'),
    ('(', 'LPARENS'),
    (')', 'RPARENS'),
])


#Make a function to silence pychecker "Local variable shadows global"
def make_partial_ops():
    """Return a dict whose keys are every non-empty prefix of every operator."""
    partials = {}
    for k in _OPERATORS:
        for i in range(1, len(k)+1):
            partials[k[:i]] = None
    return partials


_PARTIAL_OPERATORS = make_partial_ops()


def is_partial_op(s):
    """Return True if s matches a non-empty subpart of an operator starting
    at its first character.
    """
    return s in _PARTIAL_OPERATORS


def is_op(s):
    """If s matches an operator, returns the operator identifier. Return None
    otherwise.
    """
    return _OPERATORS.get(s)


# Reserved word text -> token type.
_RESERVEDS = dict([
    ('if', 'If'),
    ('then', 'Then'),
    ('else', 'Else'),
    ('elif', 'Elif'),
    ('fi', 'Fi'),
    ('do', 'Do'),
    ('done', 'Done'),
    ('case', 'Case'),
    ('esac', 'Esac'),
    ('while', 'While'),
    ('until', 'Until'),
    ('for', 'For'),
    ('{', 'Lbrace'),
    ('}', 'Rbrace'),
    ('!', 'Bang'),
    ('in', 'In'),
    ('|', 'PIPE'),
])


def get_reserved(s):
    """Return the token type of reserved word s, or None if s is not reserved."""
    return _RESERVEDS.get(s)


_RE_NAME = re.compile(r'^[0-9a-zA-Z_]+$')


def is_name(s):
    """Return True if s is made only of ASCII letters, digits and underscores.

    Note that, as written, a leading digit is accepted.
    """
    return _RE_NAME.search(s) is not None


def find_chars(seq, chars):
    """Return (index, item) for the first item of seq contained in chars.

    Return (-1, None) when no item of seq is in chars.
    """
    for i,v in enumerate(seq):
        if v in chars:
            return i,v
    return -1, None


class WordLexer:
    """WordLexer parse quoted or expansion expressions and return an expression
    tree. The input string can be any well formed sequence beginning with quoting
    or expansion character. Embedded expressions are handled recursively. The
    resulting tree is made of lists and strings. Lists represent quoted or
    expansion expressions. Each list first element is the opening separator,
    the last one the closing separator. In-between can be any number of strings
    or lists for sub-expressions. Non quoted/expansion expression can written as
    strings or as lists with empty strings as starting and ending delimiters.

    Every _parse_xxx(buf, result, eof) helper works on the pending character
    list buf and on result, the list being built for the current expression.
    It returns a (read, closed) pair: read is the number of leading buf items
    the caller must drop, closed tells whether the current expression is
    complete. When closed is False, a nested expression starts at the first
    remaining character. The helpers raise NeedMore when buf does not hold
    enough characters to decide.
    """

    NAME_CHARSET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_'
    NAME_CHARSET = dict(zip(NAME_CHARSET, NAME_CHARSET))

    SPECIAL_CHARSET = '@*#?-$!0'

    #Characters which can be escaped depends on the current delimiters
    ESCAPABLE = {
        '`': set(['$', '\\', '`']),
        '"': set(['$', '\\', '`', '"']),
        "'": set(),
    }

    def __init__(self, heredoc = False):
        # _buffer is the unprocessed input characters buffer
        self._buffer = []
        # _stack is empty or contains a quoted list being processed
        # (this is the DFS path to the quoted expression being evaluated).
        self._stack = []
        self._escapable = None
        # True when parsing unquoted here documents
        self._heredoc = heredoc

    def add(self, data, eof=False):
        """Feed the lexer with more data. If the quoted expression can be
        delimited, return a tuple (expr, remaining) containing the expression
        tree and the unconsumed data.
        Otherwise, raise NeedMore.
        """
        self._buffer += list(data)
        # On NeedMore, _buffer and _stack keep their state so that the next
        # add() call resumes where this one stopped.
        self._parse(eof)

        result = self._stack[0]
        remaining = ''.join(self._buffer)
        self._stack = []
        self._buffer = []
        return result, remaining

    def _is_escapable(self, c, delim=None):
        """Return True if a backslash before c acts as an escape.

        When delim is None, the enclosing delimiter is the double quote in
        here-document mode, otherwise the opening separator of the parent
        expression on the stack (everything is escapable at top level).
        """
        if delim is None:
            if self._heredoc:
                # Backslashes works as if they were double quoted in unquoted
                # here-documents
                delim = '"'
            else:
                if len(self._stack)<=1:
                    return True
                delim = self._stack[-2][0]

        escapables = self.ESCAPABLE.get(delim, None)
        return escapables is None or c in escapables

    def _parse_squote(self, buf, result, eof):
        """Consume a single-quoted string up to and including the closing quote."""
        if not buf:
            raise NeedMore()
        try:
            pos = buf.index("'")
        except ValueError:
            raise NeedMore()
        result[-1] += ''.join(buf[:pos])
        result += ["'"]
        return pos+1, True

    def _parse_bquote(self, buf, result, eof):
        """Consume the single character following a backslash."""
        if not buf:
            raise NeedMore()

        if buf[0]=='\n':
            #Remove line continuations
            result[:] = ['', '', '']
        elif self._is_escapable(buf[0]):
            result[-1] += buf[0]
            result += ['']
        else:
            #Keep as such
            result[:] = ['', '\\'+buf[0], '']

        return 1, True

    def _parse_dquote(self, buf, result, eof):
        """Consume double-quoted text up to the closing quote or to the start
        of a nested expression.
        """
        if not buf:
            raise NeedMore()
        pos, sep = find_chars(buf, '$\\`"')
        if pos==-1:
            raise NeedMore()

        result[-1] += ''.join(buf[:pos])
        if sep=='"':
            result += ['"']
            return pos+1, True
        else:
            #Keep everything until the separator and defer processing
            return pos, False

    def _parse_command(self, buf, result, eof):
        """Consume a command substitution body, either $(...) or `...`, up to
        its closing separator or to the start of a nested expression.
        """
        if not buf:
            raise NeedMore()

        chars = '$\\`"\''
        if result[0] == '$(':
            chars += ')'
        pos, sep = find_chars(buf, chars)
        if pos == -1:
            raise NeedMore()

        result[-1] += ''.join(buf[:pos])
        if (result[0]=='$(' and sep==')') or (result[0]=='`' and sep=='`'):
            result += [sep]
            return pos+1, True
        else:
            return pos, False

    def _parse_parameter(self, buf, result, eof):
        """Consume a ${...} body up to the closing brace or to the start of a
        nested expression.
        """
        if not buf:
            raise NeedMore()

        pos, sep = find_chars(buf, '$\\`"\'}')
        if pos==-1:
            raise NeedMore()

        result[-1] += ''.join(buf[:pos])
        if sep=='}':
            result += [sep]
            return pos+1, True
        else:
            return pos, False

    def _parse_dollar(self, buf, result, eof):
        """Parse an expression introduced by a dollar sign.

        On first entry result[0] is '$': the following characters decide
        whether this is '$(', '${', '$((', a special parameter or a plain
        name. For '$(' and '${' the opening characters are removed from buf
        in place and parsing is delegated to the matching helper. '$((' is
        recognized but not implemented and raises NotImplementedError.
        """
        sep = result[0]
        if sep=='$':
            if not buf:
                #TODO: handle empty $
                raise NeedMore()
            if buf[0]=='(':
                if len(buf)==1:
                    # Cannot tell '$(' from '$((' yet
                    raise NeedMore()

                if buf[1]=='(':
                    result[0] = '$(('
                    buf[:2] = []
                else:
                    result[0] = '$('
                    buf[:1] = []

            elif buf[0]=='{':
                result[0] = '${'
                buf[:1] = []
            else:
                if buf[0] in self.SPECIAL_CHARSET:
                    result[-1] = buf[0]
                    read = 1
                else:
                    for read,c in enumerate(buf):
                        if c not in self.NAME_CHARSET:
                            break
                    else:
                        # Every buffered character belongs to the name: it
                        # may continue in the next chunk unless at eof.
                        if not eof:
                            raise NeedMore()
                        read += 1

                    result[-1] += ''.join(buf[0:read])

                if not result[-1]:
                    # Lone dollar: turn it into a literal sequence
                    result[:] = ['', result[0], '']
                else:
                    result += ['']
                return read,True

        sep = result[0]
        if sep=='$(':
            parsefunc = self._parse_command
        elif sep=='${':
            parsefunc = self._parse_parameter
        else:
            raise NotImplementedError(sep)

        pos, closed = parsefunc(buf, result, eof)
        return pos, closed

    def _parse(self, eof):
        """Run the helpers until the outermost expression is closed.

        Raise NeedMore when input runs out and ShellSyntaxError when an
        expression does not start with a quoting or expansion character.
        """
        buf = self._buffer
        stack = self._stack
        recurse = False

        while 1:
            if not stack or recurse:
                # Open a new (possibly nested) expression
                if not buf:
                    raise NeedMore()
                if buf[0] not in ('"\\`$\''):
                    raise ShellSyntaxError('Invalid quoted string sequence')
                stack.append([buf[0], ''])
                buf[:1] = []
                recurse = False

            result = stack[-1]
            if result[0]=="'":
                parsefunc = self._parse_squote
            elif result[0]=='\\':
                parsefunc = self._parse_bquote
            elif result[0]=='"':
                parsefunc = self._parse_dquote
            elif result[0]=='`':
                parsefunc = self._parse_command
            elif result[0][0]=='$':
                parsefunc = self._parse_dollar
            else:
                raise NotImplementedError()

            read, closed = parsefunc(buf, result, eof)

            buf[:read] = []
            if closed:
                if len(stack)>1:
                    #Merge in parent expression
                    parsed = stack.pop()
                    stack[-1] += [parsed]
                    stack[-1] += ['']
                else:
                    break
            else:
                recurse = True


def normalize_wordtree(wtree):
    """Fold back every literal sequence (delimited with empty strings) into
    parent sequence.
    """
    def normalize(wtree):
        result = []
        for part in wtree[1:-1]:
            if isinstance(part, list):
                part = normalize(part)
                if part[0]=='':
                    #Move the part content back at current level
                    result += part[1:-1]
                    continue
            elif not part:
                #Remove empty strings
                continue
            result.append(part)
        if not result:
            result = ['']
        return [wtree[0]] + result + [wtree[-1]]

    return normalize(wtree)


def make_wordtree(token, here_document=False):
    """Parse a delimited token and return a tree similar to the ones returned by
    WordLexer. token may contain any combinations of expansion/quoted fields and
    non-ones.

    When here_document is True, quotes are not treated as delimiters.
    Raise ShellSyntaxError if a quoted or expansion field is not terminated.
    """
    tree = ['']
    remaining = token
    delimiters = '\\$`'
    if not here_document:
        delimiters += '\'"'

    while 1:
        pos, sep = find_chars(remaining, delimiters)
        if pos==-1:
            tree += [remaining, '']
            return normalize_wordtree(tree)
        tree.append(remaining[:pos])
        remaining = remaining[pos:]

        try:
            result, remaining = WordLexer(heredoc = here_document).add(remaining, True)
        except NeedMore:
            # Report the offending token (the format argument used to be
            # missing, leaving a literal "%s" in the message).
            raise ShellSyntaxError('Invalid token "%s"' % token)
        tree.append(result)


def wordtree_as_string(wtree):
    """Rewrite an expression tree generated by make_wordtree as string."""
    def visit(node, output):
        for child in node:
            if isinstance(child, list):
                visit(child, output)
            else:
                output.append(child)

    output = []
    visit(wtree, output)
    return ''.join(output)


def unquote_wordtree(wtree):
    """Fold the word tree while removing quotes everywhere. Other expansion
    sequences are joined as such.
    """
    def unquote(wtree):
        unquoted = []
        if wtree[0] in ('', "'", '"', '\\'):
            # Drop the opening and closing separators of quoting nodes
            wtree = wtree[1:-1]

        for part in wtree:
            if isinstance(part, list):
                part = unquote(part)
            unquoted.append(part)
        return ''.join(unquoted)

    return unquote(wtree)


class HereDocLexer:
    """HereDocLexer delimits whatever comes from the here-document starting newline
    not included to the closing delimiter line included.
    """
    def __init__(self, op, delim):
        assert op in ('<<', '<<-')
        if not delim:
            raise ShellSyntaxError('invalid here document delimiter %s' % str(delim))

        self._op = op
        self._delim = delim
        # Unprocessed input characters
        self._buffer = []
        # Accumulated here-document content, as alternating line/eol strings
        self._token = []

    def add(self, data, eof):
        """If the here-document was delimited, return a tuple (content, remaining).
        Raise NeedMore() otherwise.
        """
        self._buffer += list(data)
        self._parse(eof)
        token = ''.join(self._token)
        remaining = ''.join(self._buffer)
        # Reset both accumulators: the remaining input has been handed back to
        # the caller (the original code reset a non-existent _remaining
        # attribute and left _buffer populated).
        self._token, self._buffer = [], []
        return token, remaining

    def _parse(self, eof):
        """Move complete lines from the buffer to the token until the delimiter
        line is found, or until the input is exhausted at eof.
        """
        while 1:
            #Look for first unescaped newline. Quotes may be ignored
            escaped = False
            for i,c in enumerate(self._buffer):
                if escaped:
                    escaped = False
                elif c=='\\':
                    escaped = True
                elif c=='\n':
                    break
            else:
                i = -1

            if i==-1 or self._buffer[i]!='\n':
                if not eof:
                    raise NeedMore()
                #No more data, maybe the last line is closing delimiter
                line = ''.join(self._buffer)
                eol = ''
                self._buffer[:] = []
            else:
                line = ''.join(self._buffer[:i])
                eol = self._buffer[i]
                self._buffer[:i+1] = []

            if self._op=='<<-':
                # '<<-' strips leading tabs from every line
                line = line.lstrip('\t')

            if line==self._delim:
                break

            self._token += [line, eol]
            if i==-1:
                break


class Token:
    """Token holder with a type and a value."""
    #TODO: check this is still in use
    OPERATOR = 'OPERATOR'
    WORD = 'WORD'

    def __init__(self):
        self.value = ''
        self.type = None

    def __getitem__(self, key):
        #Behave like a two elements tuple
        if key==0:
            return self.type
        if key==1:
            return self.value
        raise IndexError(key)


class HereDoc:
    """Here-document bookkeeping: operator, delimiter name and the tokens
    stored while waiting for the document content.
    """
    def __init__(self, op, name=None):
        self.op = op
        self.name = name
        # (token, type, delimiter) tuples pushed again once the document is read
        self.pendings = []


TK_COMMA = 'COMMA'
TK_AMPERSAND = 'AMP'
TK_OP = 'OP'
TK_TOKEN = 'TOKEN'
TK_COMMENT = 'COMMENT'
TK_NEWLINE = 'NEWLINE'
TK_IONUMBER = 'IO_NUMBER'
TK_ASSIGNMENT = 'ASSIGNMENT_WORD'
TK_HERENAME = 'HERENAME'


class Lexer:
    """Main lexer.

    Feed input with add(). Each delimited token is passed to on_token() as a
    (value, type) pair; subclasses must implement on_token(). add() raises
    NeedMore until it is called with eof=True, and then returns the
    unconsumed input.
    """
    # Here-document handling makes the whole thing more complex because they basically
    # force tokens to be reordered: here-content must come right after the operator
    # and the here-document name, while some other tokens might be following the
    # here-document expression on the same line.
    #
    # So, here-doc states are basically:
    #   *self._state==ST_NORMAL
    #       - self._heredoc.op is None: no here-document
    #       - self._heredoc.op is not None but name is: here-document operator matched,
    #           waiting for the document name/delimiter
    #       - self._heredoc.op and name are not None: here-document is ready, following
    #           tokens are being stored and will be pushed again when the document is
    #           completely parsed.
    #   *self._state==ST_HEREDOC
    #       - The here-document is being delimited by self._herelexer. Once it is done
    #           the content is pushed in front of the pending token list then all these
    #           tokens are pushed once again.
    ST_NORMAL = 'ST_NORMAL'
    ST_OP = 'ST_OP'
    ST_BACKSLASH = 'ST_BACKSLASH'
    ST_QUOTED = 'ST_QUOTED'
    ST_COMMENT = 'ST_COMMENT'
    ST_HEREDOC = 'ST_HEREDOC'

    #Match end of backquote strings
    RE_BACKQUOTE_END = re.compile(r'(?<!\\)(`)')

    def __init__(self, parent_state = None):
        # Input characters and index of the next one to process
        self._input = []
        self._pos = 0

        # Token being built and its provisional type
        self._token = ''
        self._type = TK_TOKEN

        self._state = self.ST_NORMAL
        self._parent_state = parent_state
        self._wordlexer = None

        self._heredoc = HereDoc(None)
        self._herelexer = None

        ### Following attributes are not used for delimiting token and can safely
        ### be changed after here-document detection (see _push_token)

        # Count the number of tokens following a 'For' reserved word. Needed to
        # return an 'In' reserved word if it comes in third place.
        self._for_count = None

    def add(self, data, eof=False):
        """Feed the lexer with data.

        When eof is set to True, returns unconsumed data or raise if the lexer
        is in the middle of a delimiting operation.
        Raise NeedMore otherwise.
        """
        self._input += list(data)
        self._parse(eof)
        self._input[:self._pos] = []
        # The consumed prefix is gone: keep the index consistent with _input.
        self._pos = 0
        return ''.join(self._input)

    def _parse(self, eof):
        """Dispatch on the current state until the input is exhausted."""
        while self._state:
            if self._pos>=len(self._input):
                if not eof:
                    raise NeedMore()
                elif self._state not in (self.ST_OP, self.ST_QUOTED, self.ST_HEREDOC):
                    #Delimit the current token and leave cleanly
                    self._push_token('')
                    break
                else:
                    #Let the sublexer handle the eof themselves
                    pass

            if self._state==self.ST_NORMAL:
                self._parse_normal()
            elif self._state==self.ST_COMMENT:
                self._parse_comment()
            elif self._state==self.ST_OP:
                self._parse_op(eof)
            elif self._state==self.ST_QUOTED:
                self._parse_quoted(eof)
            elif self._state==self.ST_HEREDOC:
                self._parse_heredoc(eof)
            else:
                assert False, "Unknown state " + str(self._state)

        if self._heredoc.op is not None:
            raise ShellSyntaxError('missing here-document delimiter')

    def _parse_normal(self):
        """Process one character outside of operators, quotes and comments."""
        c = self._input[self._pos]
        if c=='\n':
            self._push_token(c)
            self._token = c
            self._type = TK_NEWLINE
            self._push_token('')
            self._pos += 1
        elif c in ('\\', '\'', '"', '`', '$'):
            # Do not consume: the word lexer needs the opening character
            self._state = self.ST_QUOTED
        elif is_partial_op(c):
            self._push_token(c)

            self._type = TK_OP
            self._token += c
            self._pos += 1
            self._state = self.ST_OP
        elif is_blank(c):
            self._push_token(c)

            #Discard blanks
            self._pos += 1
        elif self._token:
            self._token += c
            self._pos += 1
        elif c=='#':
            # '#' starts a comment only at the beginning of a token
            self._state = self.ST_COMMENT
            self._type = TK_COMMENT
            self._pos += 1
        else:
            self._pos += 1
            self._token += c

    def _parse_op(self, eof):
        """Extend the current operator as long as it remains a valid prefix."""
        assert self._token

        while 1:
            if self._pos>=len(self._input):
                if not eof:
                    raise NeedMore()
                c = ''
            else:
                c = self._input[self._pos]

            op = self._token + c
            if c and is_partial_op(op):
                #Still parsing an operator
                self._token = op
                self._pos += 1
            else:
                #End of operator
                self._push_token(c)
                self._state = self.ST_NORMAL
                break

    def _parse_comment(self):
        """Accumulate comment characters up to, not including, the end of line.

        When the input runs out, return with the state unchanged and let
        _parse() decide: it raises NeedMore if more data may come, or drops
        the comment and finishes cleanly at eof. The original code raised
        NeedMore here even at eof, so a script ending with a comment and no
        trailing newline could not be lexed.
        """
        while 1:
            if self._pos>=len(self._input):
                return

            c = self._input[self._pos]
            if c=='\n':
                #End of comment, do not consume the end of line
                self._state = self.ST_NORMAL
                break
            else:
                self._token += c
                self._pos += 1

    def _parse_quoted(self, eof):
        """Precondition: the starting backquote/dollar is still in the input queue."""
        if not self._wordlexer:
            self._wordlexer = WordLexer()

        # May legitimately stay empty: _parse() calls us at eof with nothing
        # new to transfer so that the word lexer can settle what it buffered.
        chunk = []
        if self._pos<len(self._input):
            #Transfer input queue character into the subparser
            chunk = self._input[self._pos:]
            self._pos += len(chunk)

        wtree, remaining = self._wordlexer.add(chunk, eof)
        self._wordlexer = None
        self._token += wordtree_as_string(wtree)

        #Put unparsed character back in the input queue
        if remaining:
            self._input[self._pos:self._pos] = list(remaining)
        self._state = self.ST_NORMAL

    def _parse_heredoc(self, eof):
        """Delimit the here-document content, then push it followed by the
        tokens that were stored since the here-document name.
        """
        assert not self._token

        if self._herelexer is None:
            self._herelexer = HereDocLexer(self._heredoc.op, self._heredoc.name)

        # May legitimately stay empty, e.g. when the input ends right after
        # the line holding the here-document operator.
        chunk = []
        if self._pos<len(self._input):
            #Transfer input queue character into the subparser
            chunk = self._input[self._pos:]
            self._pos += len(chunk)

        self._token, remaining = self._herelexer.add(chunk, eof)

        #Reset here-document state
        self._herelexer = None
        heredoc, self._heredoc = self._heredoc, HereDoc(None)
        if remaining:
            self._input[self._pos:self._pos] = list(remaining)
        self._state = self.ST_NORMAL

        #Push pending tokens
        heredoc.pendings[:0] = [(self._token, self._type, heredoc.name)]
        for token, toktype, delim in heredoc.pendings:
            self._token = token
            self._type = toktype
            self._push_token(delim)

    def _push_token(self, delim):
        """Finalize the current token and pass it to on_token().

        delim is the character which ended the token, or '' at end of input.
        Return 1 if a token was emitted or stored for a pending
        here-document, 0 if there was nothing to push or the token was a
        comment.
        """
        if not self._token:
            return 0

        if self._heredoc.op is not None:
            if self._heredoc.name is None:
                #Here-document name
                if self._type!=TK_TOKEN:
                    raise ShellSyntaxError("expecting here-document name, got '%s'" % self._token)
                self._heredoc.name = unquote_wordtree(make_wordtree(self._token))
                self._type = TK_HERENAME
            else:
                #Capture all tokens until the newline starting the here-document
                if self._type==TK_NEWLINE:
                    assert self._state==self.ST_NORMAL
                    self._state = self.ST_HEREDOC

                self._heredoc.pendings.append((self._token, self._type, delim))
                self._token = ''
                self._type = TK_TOKEN
                return 1

        # BEWARE: do not change parser state from here to the end of the function:
        # when parsing between an here-document operator to the end of the line
        # tokens are stored in self._heredoc.pendings. Therefore, they will not
        # reach the section below.

        #Check operators
        if self._type==TK_OP:
            #False positive because of partial op matching
            op = is_op(self._token)
            if not op:
                self._type = TK_TOKEN
            else:
                #Map to the specific operator
                self._type = op
                if self._token in ('<<', '<<-'):
                    #Done here rather than in _parse_op because there is no need
                    #to change the parser state since we are still waiting for
                    #the here-document name
                    if self._heredoc.op is not None:
                        raise ShellSyntaxError("syntax error near token '%s'" % self._token)
                    assert self._heredoc.op is None
                    self._heredoc.op = self._token

        if self._type==TK_TOKEN:
            if '=' in self._token and not delim:
                if self._token.startswith('='):
                    #Token is a WORD... a TOKEN that is.
                    pass
                else:
                    prev = self._token[:self._token.find('=')]
                    if is_name(prev):
                        self._type = TK_ASSIGNMENT
                    else:
                        #Just a token (unspecified)
                        pass
            else:
                reserved = get_reserved(self._token)
                if reserved is not None:
                    if reserved=='In' and self._for_count!=2:
                        #Sorry, not a reserved word after all
                        pass
                    else:
                        self._type = reserved
                        if reserved in ('For', 'Case'):
                            self._for_count = 0
                elif are_digits(self._token) and delim in ('<', '>'):
                    #Detect IO_NUMBER
                    self._type = TK_IONUMBER
                elif self._token==';':
                    self._type = TK_COMMA
                elif self._token=='&':
                    self._type = TK_AMPERSAND
        elif self._type==TK_COMMENT:
            #Comments are not part of sh grammar, ignore them
            self._token = ''
            self._type = TK_TOKEN
            return 0

        if self._for_count is not None:
            #Track token count in 'For' expression to detect 'In' reserved words.
            #Can only be in third position, no need to go beyond
            self._for_count += 1
            if self._for_count==3:
                self._for_count = None

        self.on_token((self._token, self._type))
        self._token = ''
        self._type = TK_TOKEN
        return 1

    def on_token(self, token):
        """Receive a delimited (value, type) token. Must be overridden."""
        raise NotImplementedError


tokens = [
    TK_TOKEN,
# To silence yacc unused token warnings
#    TK_COMMENT,
    TK_NEWLINE,
    TK_IONUMBER,
    TK_ASSIGNMENT,
    TK_HERENAME,
]

#Add specific operators
tokens += _OPERATORS.values()
#Add reserved words
tokens += _RESERVEDS.values()


class PLYLexer(Lexer):
    """Bridge Lexer and PLY lexer interface."""
    def __init__(self):
        Lexer.__init__(self)
        # Tokens produced so far and index of the next one token() returns
        self._tokens = []
        self._current = 0
        self.lineno = 0

    def on_token(self, token):
        """Wrap the (value, type) pair in a LexToken and queue it.

        Position information is not tracked: lexpos and lineno are always 0.
        """
        value, toktype = token

        self.lineno = 0
        t = lex.LexToken()
        t.value = value
        t.type = toktype
        t.lexer = self
        t.lexpos = 0
        t.lineno = 0

        self._tokens.append(t)

    def is_empty(self):
        """Return True if no token has been produced yet."""
        return not bool(self._tokens)

    #PLY compliant interface
    def token(self):
        """Return the next queued token, or None when they are all consumed."""
        if self._current>=len(self._tokens):
            return None
        t = self._tokens[self._current]
        self._current += 1
        return t


def get_tokens(s):
    """Parse the input string and return a tuple (tokens, unprocessed) where
    tokens is a list of parsed tokens and unprocessed is the part of the input
    string left untouched by the lexer.

    Each returned token is a (value, type) pair.
    """
    lexer = PLYLexer()
    untouched = lexer.add(s, True)
    tokens = []
    while 1:
        token = lexer.token()
        if token is None:
            break
        tokens.append(token)

    tokens = [(t.value, t.type) for t in tokens]
    return tokens, untouched