summaryrefslogtreecommitdiff
path: root/poky/bitbake/lib/bb/pysh/pyshlex.py
diff options
context:
space:
mode:
Diffstat (limited to 'poky/bitbake/lib/bb/pysh/pyshlex.py')
-rw-r--r--poky/bitbake/lib/bb/pysh/pyshlex.py888
1 files changed, 888 insertions, 0 deletions
diff --git a/poky/bitbake/lib/bb/pysh/pyshlex.py b/poky/bitbake/lib/bb/pysh/pyshlex.py
new file mode 100644
index 000000000..fbf094b7a
--- /dev/null
+++ b/poky/bitbake/lib/bb/pysh/pyshlex.py
@@ -0,0 +1,888 @@
+# pyshlex.py - PLY compatible lexer for pysh.
+#
+# Copyright 2007 Patrick Mezard
+#
+# This software may be used and distributed according to the terms
+# of the GNU General Public License, incorporated herein by reference.
+
+# TODO:
+# - review all "char in 'abc'" snippets: the empty string can be matched
+# - test line continuations within quoted/expansion strings
+# - eof is buggy wrt sublexers
+# - the lexer cannot really work in pull mode as it would be required to run
+# PLY in pull mode. It was designed to work incrementally and it would not be
+# that hard to enable pull mode.
+import re
+try:
+ s = set()
+ del s
+except NameError:
+ from Set import Set as set
+
+from ply import lex
+from bb.pysh.sherrors import *
+
class NeedMore(Exception):
    """Raised by the lexers when the data fed so far cannot be delimited yet
    and more input is required.
    """
+
def is_blank(c):
    """Return True if c is a space or a horizontal tab."""
    return c == ' ' or c == '\t'
+
# \Z (and not $) is used on purpose: $ also matches right before a trailing
# newline, which would make "12\n" look like a number. [0-9] restricts the
# match to ASCII digits, \d would accept any Unicode decimal digit.
_RE_DIGITS = re.compile(r'^[0-9]+\Z')

def are_digits(s):
    """Return True if s is a non-empty string made only of ASCII digits,
    False otherwise.
    """
    return _RE_DIGITS.search(s) is not None
+
+_OPERATORS = dict([
+ ('&&', 'AND_IF'),
+ ('||', 'OR_IF'),
+ (';;', 'DSEMI'),
+ ('<<', 'DLESS'),
+ ('>>', 'DGREAT'),
+ ('<&', 'LESSAND'),
+ ('>&', 'GREATAND'),
+ ('<>', 'LESSGREAT'),
+ ('<<-', 'DLESSDASH'),
+ ('>|', 'CLOBBER'),
+ ('&', 'AMP'),
+ (';', 'COMMA'),
+ ('<', 'LESS'),
+ ('>', 'GREATER'),
+ ('(', 'LPARENS'),
+ (')', 'RPARENS'),
+])
+
#Make a function to silence pychecker "Local variable shadows global"
def make_partial_ops():
    """Return a dictionary whose keys are every non-empty leading substring of
    every operator in _OPERATORS. All values are None.
    """
    prefixes = []
    for operator in _OPERATORS:
        for end in range(1, len(operator) + 1):
            prefixes.append(operator[:end])
    return dict.fromkeys(prefixes)

_PARTIAL_OPERATORS = make_partial_ops()
+
def is_partial_op(s):
    """Tell whether s is an operator prefix.

    Return True if s is a non-empty leading part (possibly the whole) of one
    of the known operators, False otherwise.
    """
    known_prefix = s in _PARTIAL_OPERATORS
    return known_prefix
+
def is_op(s):
    """Return the operator identifier matching s, or None if s is not an
    operator.
    """
    try:
        return _OPERATORS[s]
    except KeyError:
        return None
+
+_RESERVEDS = dict([
+ ('if', 'If'),
+ ('then', 'Then'),
+ ('else', 'Else'),
+ ('elif', 'Elif'),
+ ('fi', 'Fi'),
+ ('do', 'Do'),
+ ('done', 'Done'),
+ ('case', 'Case'),
+ ('esac', 'Esac'),
+ ('while', 'While'),
+ ('until', 'Until'),
+ ('for', 'For'),
+ ('{', 'Lbrace'),
+ ('}', 'Rbrace'),
+ ('!', 'Bang'),
+ ('in', 'In'),
+ ('|', 'PIPE'),
+])
+
def get_reserved(s):
    """Return the reserved word identifier matching s, or None if s is not a
    reserved word.
    """
    try:
        return _RESERVEDS[s]
    except KeyError:
        return None
+
# \Z (and not $) is used on purpose: $ also matches right before a trailing
# newline, which would make "FOO\n" look like a valid name.
_RE_NAME = re.compile(r'^[0-9a-zA-Z_]+\Z')

def is_name(s):
    """Return True if s is a non-empty string made only of ASCII letters,
    digits and underscores, False otherwise.
    """
    return _RE_NAME.search(s) is not None
+
def find_chars(seq, chars):
    """Look for the first element of seq contained in chars.

    Return a tuple (index, element) for the first match, or (-1, None) when
    no element of seq is in chars.
    """
    position = 0
    for item in seq:
        if item in chars:
            return position, item
        position += 1
    return -1, None
+
class WordLexer:
    """WordLexer parses quoted or expansion expressions and returns an
    expression tree. The input string can be any well formed sequence beginning
    with a quoting or expansion character. Embedded expressions are handled
    recursively. The resulting tree is made of lists and strings. Lists
    represent quoted or expansion expressions. Each list first element is the
    opening separator, the last one the closing separator. In-between can be
    any number of strings or lists for sub-expressions. Non quoted/expansion
    expressions can be written as strings or as lists with empty strings as
    starting and ending delimiters.
    """

    NAME_CHARSET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_'
    NAME_CHARSET = dict(zip(NAME_CHARSET, NAME_CHARSET))

    SPECIAL_CHARSET = '@*#?-$!0'

    #Characters which can be escaped depends on the current delimiters
    ESCAPABLE = {
        '`': set(['$', '\\', '`']),
        '"': set(['$', '\\', '`', '"']),
        "'": set(),
    }

    def __init__(self, heredoc = False):
        # _buffer is the unprocessed input characters buffer
        self._buffer = []
        # _stack is empty or contains a quoted list being processed
        # (this is the DFS path to the quoted expression being evaluated).
        self._stack = []
        self._escapable = None
        # True when parsing unquoted here documents
        self._heredoc = heredoc

    def add(self, data, eof=False):
        """Feed the lexer with more data. If the quoted expression can be
        delimited, return a tuple (expr, remaining) containing the expression
        tree and the unconsumed data, and reset the lexer.
        Otherwise, raise NeedMore; the data fed so far is kept.
        """
        self._buffer += list(data)
        self._parse(eof)

        result = self._stack[0]
        remaining = ''.join(self._buffer)
        self._stack = []
        self._buffer = []
        return result, remaining

    def _is_escapable(self, c, delim=None):
        """Return True if a backslash followed by c is an escape sequence
        given the enclosing delimiter. When delim is None, it is deduced from
        the here-document mode or from the parent expression on the stack.
        """
        if delim is None:
            if self._heredoc:
                # Backslashes works as if they were double quoted in unquoted
                # here-documents
                delim = '"'
            else:
                if len(self._stack)<=1:
                    return True
                delim = self._stack[-2][0]

        escapables = self.ESCAPABLE.get(delim, None)
        return escapables is None or c in escapables

    def _parse_squote(self, buf, result, eof):
        """Consume a single quoted string up to its closing quote. Return
        (read, closed). Raise NeedMore if the closing quote is missing.
        """
        if not buf:
            raise NeedMore()
        try:
            pos = buf.index("'")
        except ValueError:
            raise NeedMore()
        result[-1] += ''.join(buf[:pos])
        result += ["'"]
        return pos+1, True

    def _parse_bquote(self, buf, result, eof):
        """Consume the character following a backslash. Return (read, closed)."""
        if not buf:
            raise NeedMore()

        if buf[0]=='\n':
            #Remove line continuations
            result[:] = ['', '', '']
        elif self._is_escapable(buf[0]):
            result[-1] += buf[0]
            result += ['']
        else:
            #Keep as such
            result[:] = ['', '\\'+buf[0], '']

        return 1, True

    def _parse_dquote(self, buf, result, eof):
        """Consume a double quoted string up to its closing quote or up to the
        next embedded expression. Return (read, closed).
        """
        if not buf:
            raise NeedMore()
        pos, sep = find_chars(buf, '$\\`"')
        if pos==-1:
            raise NeedMore()

        result[-1] += ''.join(buf[:pos])
        if sep=='"':
            result += ['"']
            return pos+1, True
        else:
            #Keep everything until the separator and defer processing
            return pos, False

    def _parse_command(self, buf, result, eof):
        """Consume a command substitution ($(...) or `...`) up to its closing
        separator or up to the next embedded expression. Return (read, closed).
        """
        if not buf:
            raise NeedMore()

        chars = '$\\`"\''
        if result[0] == '$(':
            chars += ')'
        pos, sep = find_chars(buf, chars)
        if pos == -1:
            raise NeedMore()

        result[-1] += ''.join(buf[:pos])
        if (result[0]=='$(' and sep==')') or (result[0]=='`' and sep=='`'):
            result += [sep]
            return pos+1, True
        else:
            return pos, False

    def _parse_parameter(self, buf, result, eof):
        """Consume a ${...} parameter expansion up to its closing brace or up
        to the next embedded expression. Return (read, closed).
        """
        if not buf:
            raise NeedMore()

        pos, sep = find_chars(buf, '$\\`"\'}')
        if pos==-1:
            raise NeedMore()

        result[-1] += ''.join(buf[:pos])
        if sep=='}':
            result += [sep]
            return pos+1, True
        else:
            return pos, False

    def _parse_dollar(self, buf, result, eof):
        """Consume an expression starting with a dollar sign: $name, a special
        parameter, $(...) or ${...}. Return (read, closed).

        Raise NotImplementedError for arithmetic expansions.
        """
        sep = result[0]
        if sep=='$':
            if not buf:
                if eof:
                    # A dollar sign ending the input cannot start an
                    # expansion: keep it as a literal, like a dollar sign
                    # followed by a non-name character below.
                    result[:] = ['', sep, '']
                    return 0, True
                raise NeedMore()
            if buf[0]=='(':
                if len(buf)==1:
                    raise NeedMore()

                if buf[1]=='(':
                    result[0] = '$(('
                    buf[:2] = []
                else:
                    result[0] = '$('
                    buf[:1] = []

            elif buf[0]=='{':
                result[0] = '${'
                buf[:1] = []
            else:
                if buf[0] in self.SPECIAL_CHARSET:
                    result[-1] = buf[0]
                    read = 1
                else:
                    for read,c in enumerate(buf):
                        if c not in self.NAME_CHARSET:
                            break
                    else:
                        #Every buffered character belongs to the name, it
                        #may continue in the next chunk
                        if not eof:
                            raise NeedMore()
                        read += 1

                    result[-1] += ''.join(buf[0:read])

                if not result[-1]:
                    #No name after the dollar sign, this is a literal
                    result[:] = ['', result[0], '']
                else:
                    result += ['']
                return read,True

        sep = result[0]
        if sep=='$(':
            parsefunc = self._parse_command
        elif sep=='${':
            parsefunc = self._parse_parameter
        else:
            raise NotImplementedError(sep)

        pos, closed = parsefunc(buf, result, eof)
        return pos, closed

    def _parse(self, eof):
        """Process the buffered characters until the outermost expression is
        closed. Raise NeedMore when the buffer is exhausted first and
        ShellSyntaxError if the input does not start with a quoting or
        expansion character.
        """
        buf = self._buffer
        stack = self._stack
        recurse = False

        while 1:
            if not stack or recurse:
                #Open a new (sub-)expression
                if not buf:
                    raise NeedMore()
                if buf[0] not in ('"\\`$\''):
                    raise ShellSyntaxError('Invalid quoted string sequence')
                stack.append([buf[0], ''])
                buf[:1] = []
                recurse = False

            result = stack[-1]
            if result[0]=="'":
                parsefunc = self._parse_squote
            elif result[0]=='\\':
                parsefunc = self._parse_bquote
            elif result[0]=='"':
                parsefunc = self._parse_dquote
            elif result[0]=='`':
                parsefunc = self._parse_command
            elif result[0][0]=='$':
                parsefunc = self._parse_dollar
            else:
                raise NotImplementedError()

            read, closed = parsefunc(buf, result, eof)

            buf[:read] = []
            if closed:
                if len(stack)>1:
                    #Merge in parent expression
                    parsed = stack.pop()
                    stack[-1] += [parsed]
                    stack[-1] += ['']
                else:
                    break
            else:
                #Stopped on the first character of an embedded expression
                recurse = True
+
def normalize_wordtree(wtree):
    """Return a copy of wtree where every literal sub-sequence (a list
    delimited with empty strings) is merged into its parent sequence and where
    empty strings are dropped. A sequence left without content gets a single
    empty string as content.
    """
    def fold(node):
        kept = []
        for child in node[1:-1]:
            if not isinstance(child, list):
                #Plain string, drop the empty ones
                if child:
                    kept.append(child)
                continue

            child = fold(child)
            if child[0]=='':
                #Literal sequence, splice its content at the current level
                kept.extend(child[1:-1])
            else:
                kept.append(child)

        if not kept:
            kept = ['']
        return [node[0]] + kept + [node[-1]]

    return fold(wtree)
+
+
def make_wordtree(token, here_document=False):
    """Parse a delimited token and return a tree similar to the ones returned by
    WordLexer. token may contain any combinations of expansion/quoted fields and
    non-ones.

    When here_document is True, single and double quotes do not start a
    quoted field at the top level.

    Raise ShellSyntaxError if a quoted or expansion field of token is not
    terminated.
    """
    tree = ['']
    remaining = token
    delimiters = '\\$`'
    if not here_document:
        delimiters += '\'"'

    while 1:
        pos, sep = find_chars(remaining, delimiters)
        if pos==-1:
            #Only literal characters left
            tree += [remaining, '']
            return normalize_wordtree(tree)
        tree.append(remaining[:pos])
        remaining = remaining[pos:]

        try:
            result, remaining = WordLexer(heredoc = here_document).add(remaining, True)
        except NeedMore:
            #The whole token was supplied: the field is not terminated
            raise ShellSyntaxError('Invalid token "%s"' % token)
        tree.append(result)
+
+
def wordtree_as_string(wtree):
    """Flatten an expression tree generated by make_wordtree back into a
    string, delimiters included.
    """
    def render(node):
        pieces = []
        for child in node:
            if isinstance(child, list):
                pieces.append(render(child))
            else:
                pieces.append(child)
        return ''.join(pieces)

    return render(wtree)
+
+
def unquote_wordtree(wtree):
    """Flatten the word tree into a string, dropping the delimiters of literal,
    single quoted, double quoted and backslash sequences. Other expansion
    sequences are joined with their delimiters.
    """
    removed_delimiters = ('', "'", '"', '\\')

    def strip(node):
        if node[0] in removed_delimiters:
            children = node[1:-1]
        else:
            children = node
        return ''.join([strip(child) if isinstance(child, list) else child
                        for child in children])

    return strip(wtree)
+
+
class HereDocLexer:
    """HereDocLexer delimits whatever comes from the here-document starting newline
    not included to the closing delimiter line included.
    """
    def __init__(self, op, delim):
        """op is the here-document operator ('<<' or '<<-'), delim the
        unquoted delimiter. Raise ShellSyntaxError if delim is empty.
        """
        assert op in ('<<', '<<-')
        if not delim:
            raise ShellSyntaxError('invalid here document delimiter %s' % str(delim))

        self._op = op
        self._delim = delim
        # Unprocessed input characters
        self._buffer = []
        # Here-document lines and line endings collected so far
        self._token = []

    def add(self, data, eof):
        """If the here-document was delimited, return a tuple (content, remaining)
        and reset the lexer. Raise NeedMore() otherwise.
        """
        self._buffer += list(data)
        self._parse(eof)
        token = ''.join(self._token)
        remaining = ''.join(self._buffer)
        # The unconsumed data is handed back to the caller, it must not stay
        # in the buffer or it would be processed twice.
        self._token, self._buffer = [], []
        return token, remaining

    def _parse(self, eof):
        """Move complete lines from the buffer to the token until the closing
        delimiter line is found, or until the end of the data when eof is True.
        Raise NeedMore if the buffer holds no complete line and eof is False.
        """
        while 1:
            #Look for first unescaped newline. Quotes may be ignored
            escaped = False
            for i,c in enumerate(self._buffer):
                if escaped:
                    escaped = False
                elif c=='\\':
                    escaped = True
                elif c=='\n':
                    break
            else:
                i = -1

            if i==-1 or self._buffer[i]!='\n':
                if not eof:
                    raise NeedMore()
                #No more data, maybe the last line is closing delimiter
                line = ''.join(self._buffer)
                eol = ''
                self._buffer[:] = []
            else:
                line = ''.join(self._buffer[:i])
                eol = self._buffer[i]
                self._buffer[:i+1] = []

            if self._op=='<<-':
                #Leading tabs are stripped from every line, delimiter included
                line = line.lstrip('\t')

            if line==self._delim:
                break

            self._token += [line, eol]
            if i==-1:
                #Last line reached without finding the delimiter
                break
+
class Token:
    """Token made of a type and a value, which can also be indexed like a
    (type, value) tuple.
    """
    #TODO: check this is still in use
    OPERATOR = 'OPERATOR'
    WORD = 'WORD'

    def __init__(self):
        self.value = ''
        self.type = None

    def __getitem__(self, key):
        #Behave like a two elements tuple
        for index, item in enumerate((self.type, self.value)):
            if key==index:
                return item
        raise IndexError(key)
+
+
class HereDoc:
    """Here-document being detected: its operator, its name (delimiter) and
    the tokens put on hold until its content is delimited.
    """
    def __init__(self, op, name=None):
        # List of pending tokens, owned by this instance
        self.pendings = []
        self.name = name
        self.op = op
+
# Token types assigned by the lexer.
#  - operators handled specifically
TK_COMMA      = 'COMMA'
TK_AMPERSAND  = 'AMP'
TK_OP         = 'OP'
#  - words and related
TK_TOKEN      = 'TOKEN'
TK_IONUMBER   = 'IO_NUMBER'
TK_ASSIGNMENT = 'ASSIGNMENT_WORD'
TK_HERENAME   = 'HERENAME'
#  - others
TK_COMMENT    = 'COMMENT'
TK_NEWLINE    = 'NEWLINE'
+
class Lexer:
    """Main lexer.

    Feed it with add(). Every delimited token is reported through on_token(),
    which must be implemented by subclasses.
    """
    # Here-document handling makes the whole thing more complex because they basically
    # force tokens to be reordered: here-content must come right after the operator
    # and the here-document name, while some other tokens might be following the
    # here-document expression on the same line.
    #
    # So, here-doc states are basically:
    #   *self._state==ST_NORMAL
    #       - self._heredoc.op is None: no here-document
    #       - self._heredoc.op is not None but name is: here-document operator matched,
    #           waiting for the document name/delimiter
    #       - self._heredoc.op and name are not None: here-document is ready, following
    #           tokens are being stored and will be pushed again when the document is
    #           completely parsed.
    #   *self._state==ST_HEREDOC
    #       - The here-document is being delimited by self._herelexer. Once it is done
    #           the content is pushed in front of the pending token list then all these
    #           tokens are pushed once again.
    ST_NORMAL       = 'ST_NORMAL'
    ST_OP           = 'ST_OP'
    ST_BACKSLASH    = 'ST_BACKSLASH'
    ST_QUOTED       = 'ST_QUOTED'
    ST_COMMENT      = 'ST_COMMENT'
    ST_HEREDOC      = 'ST_HEREDOC'

    #Match end of backquote strings
    RE_BACKQUOTE_END = re.compile(r'(?<!\\)(`)')

    def __init__(self, parent_state = None):
        # Input characters and position of the next one to process
        self._input = []
        self._pos = 0

        # Token being delimited and its type
        self._token = ''
        self._type = TK_TOKEN

        self._state = self.ST_NORMAL
        self._parent_state = parent_state
        self._wordlexer = None

        self._heredoc = HereDoc(None)
        self._herelexer = None

        ### Following attributes are not used for delimiting token and can safely
        ### be changed after here-document detection (see _push_token)

        # Count the number of tokens following a 'For' reserved word. Needed to
        # return an 'In' reserved word if it comes in third place.
        self._for_count = None

    def add(self, data, eof=False):
        """Feed the lexer with data.

        When eof is set to True, returns unconsumed data or raise if the lexer
        is in the middle of a delimiting operation.
        Raise NeedMore otherwise.
        """
        self._input += list(data)
        self._parse(eof)
        self._input[:self._pos] = []
        return ''.join(self._input)

    def _parse(self, eof):
        """Run the state machine over the available input.

        Raise NeedMore when the input is exhausted and eof is False, and
        ShellSyntaxError when the input ends while a here-document is still
        expected.
        """
        while self._state:
            if self._pos>=len(self._input):
                if not eof:
                    raise NeedMore()
                elif self._state not in (self.ST_OP, self.ST_QUOTED, self.ST_HEREDOC):
                    #Delimit the current token and leave cleanly
                    self._push_token('')
                    break
                else:
                    #Let the sublexer handle the eof themselves
                    pass

            if self._state==self.ST_NORMAL:
                self._parse_normal()
            elif self._state==self.ST_COMMENT:
                self._parse_comment()
            elif self._state==self.ST_OP:
                self._parse_op(eof)
            elif self._state==self.ST_QUOTED:
                self._parse_quoted(eof)
            elif self._state==self.ST_HEREDOC:
                self._parse_heredoc(eof)
            else:
                assert False, "Unknown state " + str(self._state)

        if self._heredoc.op is not None:
            raise ShellSyntaxError('missing here-document delimiter')

    def _parse_normal(self):
        """Process one input character outside of operators, quoted
        expressions, comments and here-documents.
        """
        c = self._input[self._pos]
        if c=='\n':
            #Delimit the current token then emit the newline itself
            self._push_token(c)
            self._token = c
            self._type = TK_NEWLINE
            self._push_token('')
            self._pos += 1
        elif c in ('\\', '\'', '"', '`', '$'):
            #The character is left in the input for the word lexer
            self._state = self.ST_QUOTED
        elif is_partial_op(c):
            self._push_token(c)

            self._type = TK_OP
            self._token += c
            self._pos += 1
            self._state = self.ST_OP
        elif is_blank(c):
            self._push_token(c)

            #Discard blanks
            self._pos += 1
        elif self._token:
            self._token += c
            self._pos += 1
        elif c=='#':
            #A hash sign starts a comment only at the beginning of a token
            self._state = self.ST_COMMENT
            self._type = TK_COMMENT
            self._pos += 1
        else:
            self._pos += 1
            self._token += c

    def _parse_op(self, eof):
        """Extend the current operator as long as the result is still the
        beginning of a known operator, then delimit it.
        """
        assert self._token

        while 1:
            if self._pos>=len(self._input):
                if not eof:
                    raise NeedMore()
                c = ''
            else:
                c = self._input[self._pos]

            op = self._token + c
            if c and is_partial_op(op):
                #Still parsing an operator
                self._token = op
                self._pos += 1
            else:
                #End of operator
                self._push_token(c)
                self._state = self.ST_NORMAL
                break

    def _parse_comment(self):
        """Accumulate comment characters up to the end of the line or of the
        available input.
        """
        while 1:
            if self._pos>=len(self._input):
                #Out of data: stay in comment state and let _parse() either
                #ask for more or, at end of input, delimit the comment.
                #Raising NeedMore here would reject a final comment without
                #a trailing newline.
                break

            c = self._input[self._pos]
            if c=='\n':
                #End of comment, do not consume the end of line
                self._state = self.ST_NORMAL
                break
            else:
                self._token += c
                self._pos += 1

    def _parse_quoted(self, eof):
        """Delimit a quoted or expansion expression with a WordLexer and
        append it to the current token.

        Precondition: the starting backquote/dollar is still in the input queue.
        """
        if not self._wordlexer:
            self._wordlexer = WordLexer()

        #Transfer input queue character into the subparser. The list may be
        #empty when the end of input is signalled without new data, the
        #subparser is called anyway to settle its state.
        pending = self._input[self._pos:]
        self._pos += len(pending)

        wtree, remaining = self._wordlexer.add(pending, eof)
        self._wordlexer = None
        self._token += wordtree_as_string(wtree)

        #Put unparsed character back in the input queue
        if remaining:
            self._input[self._pos:self._pos] = list(remaining)
        self._state = self.ST_NORMAL

    def _parse_heredoc(self, eof):
        """Delimit the here-document content with a HereDocLexer, then push
        the content followed by all the tokens which were put on hold.
        """
        assert not self._token

        if self._herelexer is None:
            self._herelexer = HereDocLexer(self._heredoc.op, self._heredoc.name)

        #Transfer input queue character into the subparser. The list may be
        #empty when the input ends right after the line opening the
        #here-document, the subparser is called anyway to settle its state.
        pending = self._input[self._pos:]
        self._pos += len(pending)

        self._token, remaining = self._herelexer.add(pending, eof)

        #Reset here-document state
        self._herelexer = None
        heredoc, self._heredoc = self._heredoc, HereDoc(None)
        if remaining:
            self._input[self._pos:self._pos] = list(remaining)
        self._state = self.ST_NORMAL

        #Push pending tokens
        heredoc.pendings[:0] = [(self._token, self._type, heredoc.name)]
        for token, kind, delim in heredoc.pendings:
            self._token = token
            self._type = kind
            self._push_token(delim)

    def _push_token(self, delim):
        """Finalize the current token, delim being the character which
        delimited it or an empty string.

        Return 1 if a token was emitted through on_token() or stored as
        pending because of a here-document, 0 if there was no token or if it
        was a comment.
        """
        if not self._token:
            return 0

        if self._heredoc.op is not None:
            if self._heredoc.name is None:
                #Here-document name
                if self._type!=TK_TOKEN:
                    raise ShellSyntaxError("expecting here-document name, got '%s'" % self._token)
                self._heredoc.name = unquote_wordtree(make_wordtree(self._token))
                self._type = TK_HERENAME
            else:
                #Capture all tokens until the newline starting the here-document
                if self._type==TK_NEWLINE:
                    assert self._state==self.ST_NORMAL
                    self._state = self.ST_HEREDOC

            self._heredoc.pendings.append((self._token, self._type, delim))
            self._token = ''
            self._type = TK_TOKEN
            return 1

        # BEWARE: do not change parser state from here to the end of the function:
        # when parsing between an here-document operator to the end of the line
        # tokens are stored in self._heredoc.pendings. Therefore, they will not
        # reach the section below.

        #Check operators
        if self._type==TK_OP:
            #False positive because of partial op matching
            op = is_op(self._token)
            if not op:
                self._type = TK_TOKEN
            else:
                #Map to the specific operator
                self._type = op
                if self._token in ('<<', '<<-'):
                    #Done here rather than in _parse_op because there is no need
                    #to change the parser state since we are still waiting for
                    #the here-document name. self._heredoc.op is necessarily
                    #None at this point, see the early return above.
                    self._heredoc.op = self._token

        if self._type==TK_TOKEN:
            if '=' in self._token and not delim:
                if self._token.startswith('='):
                    #Token is a WORD... a TOKEN that is.
                    pass
                else:
                    prev = self._token[:self._token.find('=')]
                    if is_name(prev):
                        self._type = TK_ASSIGNMENT
                    else:
                        #Just a token (unspecified)
                        pass
            else:
                reserved = get_reserved(self._token)
                if reserved is not None:
                    if reserved=='In' and self._for_count!=2:
                        #Sorry, not a reserved word after all
                        pass
                    else:
                        self._type = reserved
                        if reserved in ('For', 'Case'):
                            self._for_count = 0
                elif are_digits(self._token) and delim in ('<', '>'):
                    #Detect IO_NUMBER
                    self._type = TK_IONUMBER
        elif self._token==';':
            self._type = TK_COMMA
        elif self._token=='&':
            self._type = TK_AMPERSAND
        elif self._type==TK_COMMENT:
            #Comments are not part of sh grammar, ignore them
            self._token = ''
            self._type = TK_TOKEN
            return 0

        if self._for_count is not None:
            #Track token count in 'For' expression to detect 'In' reserved words.
            #Can only be in third position, no need to go beyond
            self._for_count += 1
            if self._for_count==3:
                self._for_count = None

        self.on_token((self._token, self._type))
        self._token = ''
        self._type = TK_TOKEN
        return 1

    def on_token(self, token):
        """Called with a (value, type) tuple for every delimited token. Must
        be implemented by subclasses.
        """
        raise NotImplementedError
+
+
# Token types declared to the parser generator. TK_COMMENT is left out on
# purpose, to silence yacc unused token warnings.
tokens = [TK_TOKEN, TK_NEWLINE, TK_IONUMBER, TK_ASSIGNMENT, TK_HERENAME]

#Add specific operators
tokens.extend(_OPERATORS.values())
#Add reserved words
tokens.extend(_RESERVEDS.values())
+
class PLYLexer(Lexer):
    """Bridge Lexer and PLY lexer interface."""
    def __init__(self):
        Lexer.__init__(self)
        # Tokens collected so far and index of the next one to hand out
        self._tokens = []
        self._current = 0
        self.lineno = 0

    def on_token(self, token):
        """Wrap the (value, type) tuple into a PLY LexToken and store it."""
        value, kind = token

        self.lineno = 0
        lextoken = lex.LexToken()
        lextoken.value = value
        lextoken.type = kind
        lextoken.lexer = self
        # Positions are not tracked
        lextoken.lexpos = 0
        lextoken.lineno = 0

        self._tokens.append(lextoken)

    def is_empty(self):
        """Return True if no token has been collected."""
        return not self._tokens

    #PLY compliant interface
    def token(self):
        """Return the next collected token, or None when there is none left."""
        if self._current<len(self._tokens):
            lextoken = self._tokens[self._current]
            self._current += 1
            return lextoken
        return None
+
+
def get_tokens(s):
    """Lex the whole input string s.

    Return a tuple (tokens, unprocessed) where tokens is a list of
    (value, type) tuples and unprocessed is the part of the input string left
    untouched by the lexer.
    """
    lexer = PLYLexer()
    untouched = lexer.add(s, True)

    pairs = []
    lextoken = lexer.token()
    while lextoken is not None:
        pairs.append((lextoken.value, lextoken.type))
        lextoken = lexer.token()

    return pairs, untouched