# pyshlex.py - PLY compatible lexer for pysh.
#
# Copyright 2007 Patrick Mezard
#
# This software may be used and distributed according to the terms
# of the GNU General Public License, incorporated herein by reference.

# TODO:
# - review all "char in 'abc'" snippets: the empty string can be matched
# - test line continuations within quoted/expansion strings
# - eof is buggy wrt sublexers
# - the lexer cannot really work in pull mode as that would require running
# PLY in pull mode too. It was designed to work incrementally and it would
# not be that hard to enable pull mode.
import re

from ply import lex
from bb.pysh.sherrors import *

class NeedMore(Exception):
    pass

def is_blank(c):
    return c in (' ', '\t')

_RE_DIGITS = re.compile(r'^\d+$')

def are_digits(s):
    return _RE_DIGITS.search(s) is not None

_OPERATORS = dict([
    ('&&', 'AND_IF'),
    ('||', 'OR_IF'),
    (';;', 'DSEMI'),
    ('<<', 'DLESS'),
    ('>>', 'DGREAT'),
    ('<&', 'LESSAND'),
    ('>&', 'GREATAND'),
    ('<>', 'LESSGREAT'),
    ('<<-', 'DLESSDASH'),
    ('>|', 'CLOBBER'),
    ('&', 'AMP'),
    (';', 'COMMA'),
    ('<', 'LESS'),
    ('>', 'GREATER'),
    ('(', 'LPARENS'),
    (')', 'RPARENS'),
])

#Make a function to silence pychecker "Local variable shadows global"
def make_partial_ops():
    partials = {}
    for k in _OPERATORS:
        for i in range(1, len(k)+1):
            partials[k[:i]] = None
    return partials

_PARTIAL_OPERATORS = make_partial_ops()

def is_partial_op(s):
    """Return True if s matches a non-empty subpart of an operator starting
    at its first character.
    """
    return s in _PARTIAL_OPERATORS

def is_op(s):
    """Return the operator identifier if s matches an operator, or None
    otherwise.
    """
    return _OPERATORS.get(s)
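
# Illustrative behaviour of the operator helpers (assumed, derived from the
# tables above, not from the original sources):
#   is_partial_op('<')  -> True      # '<' begins '<', '<<', '<<-', '<&', '<>'
#   is_op('&&')         -> 'AND_IF'
#   is_op('&&&')        -> None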

_RESERVEDS = dict([
    ('if', 'If'),
    ('then', 'Then'),
    ('else', 'Else'),
    ('elif', 'Elif'),
    ('fi', 'Fi'),
    ('do', 'Do'),
    ('done', 'Done'),
    ('case', 'Case'),
    ('esac', 'Esac'),
    ('while', 'While'),
    ('until', 'Until'),
    ('for', 'For'),
    ('{', 'Lbrace'),
    ('}', 'Rbrace'),
    ('!', 'Bang'),
    ('in', 'In'),
    ('|', 'PIPE'),
])

def get_reserved(s):
    return _RESERVEDS.get(s)

_RE_NAME = re.compile(r'^[0-9a-zA-Z_]+$')

def is_name(s):
    return _RE_NAME.search(s) is not None

def find_chars(seq, chars):
    for i,v in enumerate(seq):
        if v in chars:
            return i,v
    return -1, None
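
# For example (assumed): find_chars('ab$cd', '$`') -> (2, '$'),
# and find_chars('abc', '`') -> (-1, None).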

class WordLexer:
    """WordLexer parses quoted or expansion expressions and returns an
    expression tree. The input string can be any well formed sequence
    beginning with a quoting or expansion character. Embedded expressions are
    handled recursively. The resulting tree is made of lists and strings.
    Lists represent quoted or expansion expressions. The first element of each
    list is the opening separator, the last one the closing separator.
    In-between can be any number of strings, or lists for sub-expressions.
    Non-quoted/expansion expressions can be written as strings or as lists
    with empty strings as starting and ending delimiters.
    """

    NAME_CHARSET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_'
    NAME_CHARSET = dict(zip(NAME_CHARSET, NAME_CHARSET))

    SPECIAL_CHARSET = '@*#?-$!0'

    #Characters which can be escaped depend on the current delimiters
    ESCAPABLE = {
        '`': set(['$', '\\', '`']),
        '"': set(['$', '\\', '`', '"']),
        "'": set(),
    }

    def __init__(self, heredoc = False):
        # _buffer is the unprocessed input characters buffer
        self._buffer = []
        # _stack is empty or contains a quoted list being processed
        # (this is the DFS path to the quoted expression being evaluated).
        self._stack = []
        self._escapable = None
        # True when parsing unquoted here documents
        self._heredoc = heredoc

    def add(self, data, eof=False):
        """Feed the lexer with more data. If the quoted expression can be
        delimited, return a tuple (expr, remaining) containing the expression
        tree and the unconsumed data.
        Otherwise, raise NeedMore.
        """
        self._buffer += list(data)
        self._parse(eof)

        result = self._stack[0]
        remaining = ''.join(self._buffer)
        self._stack = []
        self._buffer = []
        return result, remaining
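
    # A hedged usage sketch (hand-traced, not from the original sources):
    #   WordLexer().add('"foo $bar" rest', eof=True)
    #   -> (['"', 'foo ', ['$', 'bar', ''], '', '"'], ' rest')
    # i.e. the quoted expression is delimited and ' rest' is left unconsumed.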

    def _is_escapable(self, c, delim=None):
        if delim is None:
            if self._heredoc:
                # Backslashes work as if they were double quoted in unquoted
                # here-documents
                delim = '"'
            else:
                if len(self._stack)<=1:
                    return True
                delim = self._stack[-2][0]

        escapables = self.ESCAPABLE.get(delim, None)
        return escapables is None or c in escapables

    def _parse_squote(self, buf, result, eof):
        if not buf:
            raise NeedMore()
        try:
            pos = buf.index("'")
        except ValueError:
            raise NeedMore()
        result[-1] += ''.join(buf[:pos])
        result += ["'"]
        return pos+1, True
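
    # Illustrative single-quote case (assumed, hand-traced):
    #   WordLexer().add("'abc' tail", eof=True)  ->  (["'", 'abc', "'"], ' tail')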

    def _parse_bquote(self, buf, result, eof):
        if not buf:
            raise NeedMore()

        if buf[0]=='\n':
            #Remove line continuations
            result[:] = ['', '', '']
        elif self._is_escapable(buf[0]):
            result[-1] += buf[0]
            result += ['']
        else:
            #Keep as such
            result[:] = ['', '\\'+buf[0], '']

        return 1, True

    def _parse_dquote(self, buf, result, eof):
        if not buf:
            raise NeedMore()
        pos, sep = find_chars(buf, '$\\`"')
        if pos==-1:
            raise NeedMore()

        result[-1] += ''.join(buf[:pos])
        if sep=='"':
            result += ['"']
            return pos+1, True
        else:
            #Keep everything until the separator and defer processing
            return pos, False

    def _parse_command(self, buf, result, eof):
        if not buf:
            raise NeedMore()

        chars = '$\\`"\''
        if result[0] == '$(':
            chars += ')'
        pos, sep = find_chars(buf, chars)
        if pos == -1:
            raise NeedMore()

        result[-1] += ''.join(buf[:pos])
        if (result[0]=='$(' and sep==')') or (result[0]=='`' and sep=='`'):
            result += [sep]
            return pos+1, True
        else:
            return pos, False
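
    # Illustrative backquote substitution (assumed, hand-traced):
    #   WordLexer().add('`ls` tail', eof=True)  ->  (['`', 'ls', '`'], ' tail')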

    def _parse_parameter(self, buf, result, eof):
        if not buf:
            raise NeedMore()

        pos, sep = find_chars(buf, '$\\`"\'}')
        if pos==-1:
            raise NeedMore()

        result[-1] += ''.join(buf[:pos])
        if sep=='}':
            result += [sep]
            return pos+1, True
        else:
            return pos, False

    def _parse_dollar(self, buf, result, eof):
        sep = result[0]
        if sep=='$':
            if not buf:
                #TODO: handle empty $
                raise NeedMore()
            if buf[0]=='(':
                if len(buf)==1:
                    raise NeedMore()

                if buf[1]=='(':
                    result[0] = '$(('
                    buf[:2] = []
                else:
                    result[0] = '$('
                    buf[:1] = []

            elif buf[0]=='{':
                result[0] = '${'
                buf[:1] = []
            else:
                if buf[0] in self.SPECIAL_CHARSET:
                    result[-1] = buf[0]
                    read = 1
                else:
                    for read,c in enumerate(buf):
                        if c not in self.NAME_CHARSET:
                            break
                    else:
                        if not eof:
                            raise NeedMore()
                        read += 1

                    result[-1] += ''.join(buf[0:read])

                if not result[-1]:
                    result[:] = ['', result[0], '']
                else:
                    result += ['']
                return read,True

        sep = result[0]
        if sep=='$(':
            parsefunc = self._parse_command
        elif sep=='${':
            parsefunc = self._parse_parameter
        else:
            raise NotImplementedError(sep)

        pos, closed = parsefunc(buf, result, eof)
        return pos, closed
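
    # Illustrative dollar expansions (assumed, hand-traced):
    #   WordLexer().add('$foo bar', eof=True)   ->  (['$', 'foo', ''], ' bar')
    #   WordLexer().add('${foo} bar', eof=True) ->  (['${', 'foo', '}'], ' bar')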

    def _parse(self, eof):
        buf = self._buffer
        stack = self._stack
        recurse = False

        while 1:
            if not stack or recurse:
                if not buf:
                    raise NeedMore()
                if buf[0] not in ('"\\`$\''):
                    raise ShellSyntaxError('Invalid quoted string sequence')
                stack.append([buf[0], ''])
                buf[:1] = []
                recurse = False

            result = stack[-1]
            if result[0]=="'":
                parsefunc = self._parse_squote
            elif result[0]=='\\':
                parsefunc = self._parse_bquote
            elif result[0]=='"':
                parsefunc = self._parse_dquote
            elif result[0]=='`':
                parsefunc = self._parse_command
            elif result[0][0]=='$':
                parsefunc = self._parse_dollar
            else:
                raise NotImplementedError()

            read, closed = parsefunc(buf, result, eof)

            buf[:read] = []
            if closed:
                if len(stack)>1:
                    #Merge in parent expression
                    parsed = stack.pop()
                    stack[-1] += [parsed]
                    stack[-1] += ['']
                else:
                    break
            else:
                recurse = True

def normalize_wordtree(wtree):
    """Fold back every literal sequence (delimited with empty strings) into
    the parent sequence.
    """
    def normalize(wtree):
        result = []
        for part in wtree[1:-1]:
            if isinstance(part, list):
                part = normalize(part)
                if part[0]=='':
                    #Move the part content back at current level
                    result += part[1:-1]
                    continue
            elif not part:
                #Remove empty strings
                continue
            result.append(part)
        if not result:
            result = ['']
        return [wtree[0]] + result + [wtree[-1]]

    return normalize(wtree)


def make_wordtree(token, here_document=False):
    """Parse a delimited token and return a tree similar to the ones returned
    by WordLexer. token may contain any combination of expansion/quoted fields
    and plain text.
    """
    tree = ['']
    remaining = token
    delimiters = '\\$`'
    if not here_document:
        delimiters += '\'"'

    while 1:
        pos, sep = find_chars(remaining, delimiters)
        if pos==-1:
            tree += [remaining, '']
            return normalize_wordtree(tree)
        tree.append(remaining[:pos])
        remaining = remaining[pos:]

        try:
            result, remaining = WordLexer(heredoc = here_document).add(remaining, True)
        except NeedMore:
            raise ShellSyntaxError('Invalid token "%s"' % token)
        tree.append(result)
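
# A hedged example of the resulting tree shape (hand-traced, not from the
# original sources):
#   make_wordtree('"a b"')  ->  ['', ['"', 'a b', '"'], '']
# Literal runs stay as bare strings while quoted/expansion runs become
# sub-lists delimited by their opening and closing separators.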


def wordtree_as_string(wtree):
    """Rewrite an expression tree generated by make_wordtree as a string."""
    def visit(node, output):
        for child in node:
            if isinstance(child, list):
                visit(child, output)
            else:
                output.append(child)

    output = []
    visit(wtree, output)
    return ''.join(output)
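
# Stringification should round-trip with make_wordtree (an assumed property,
# checked by hand on simple inputs only):
#   wordtree_as_string(make_wordtree('"a b"')) == '"a b"'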


def unquote_wordtree(wtree):
    """Fold the word tree while removing quotes everywhere. Other expansion
    sequences are joined as such.
    """
    def unquote(wtree):
        unquoted = []
        if wtree[0] in ('', "'", '"', '\\'):
            wtree = wtree[1:-1]

        for part in wtree:
            if isinstance(part, list):
                part = unquote(part)
            unquoted.append(part)
        return ''.join(unquoted)

    return unquote(wtree)
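
# Illustrative unquoting (assumed, hand-traced):
#   unquote_wordtree(make_wordtree('"a b"'))  ->  'a b'
#   unquote_wordtree(make_wordtree("'a b'"))  ->  'a b'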


class HereDocLexer:
    """HereDocLexer delimits the content of a here-document, from the starting
    newline (not included) to the closing delimiter line (included).
    """
    def __init__(self, op, delim):
        assert op in ('<<', '<<-')
        if not delim:
            raise ShellSyntaxError('invalid here document delimiter %s' % str(delim))

        self._op = op
        self._delim = delim
        self._buffer = []
        self._token = []

    def add(self, data, eof):
        """If the here-document was delimited, return a tuple (content, remaining).
        Raise NeedMore() otherwise.
        """
        self._buffer += list(data)
        self._parse(eof)
        token = ''.join(self._token)
        remaining = ''.join(self._buffer)
        self._token, self._buffer = [], []
        return token, remaining
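
    # A usage sketch (assumed, hand-traced): delimiting a '<<' body.
    #   HereDocLexer('<<', 'EOF').add('hello\nEOF\nrest', eof=False)
    #   -> ('hello\n', 'rest')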

    def _parse(self, eof):
        while 1:
            #Look for first unescaped newline. Quotes may be ignored
            escaped = False
            for i,c in enumerate(self._buffer):
                if escaped:
                    escaped = False
                elif c=='\\':
                    escaped = True
                elif c=='\n':
                    break
            else:
                i = -1

            if i==-1 or self._buffer[i]!='\n':
                if not eof:
                    raise NeedMore()
                #No more data, maybe the last line is the closing delimiter
                line = ''.join(self._buffer)
                eol = ''
                self._buffer[:] = []
            else:
                line = ''.join(self._buffer[:i])
                eol = self._buffer[i]
                self._buffer[:i+1] = []

            if self._op=='<<-':
                line = line.lstrip('\t')

            if line==self._delim:
                break

            self._token += [line, eol]
            if i==-1:
                break

class Token:
    #TODO: check this is still in use
    OPERATOR = 'OPERATOR'
    WORD = 'WORD'

    def __init__(self):
        self.value = ''
        self.type = None

    def __getitem__(self, key):
        #Behave like a two-element tuple
        if key==0:
            return self.type
        if key==1:
            return self.value
        raise IndexError(key)


class HereDoc:
    def __init__(self, op, name=None):
        self.op = op
        self.name = name
        self.pendings = []

TK_COMMA        = 'COMMA'
TK_AMPERSAND    = 'AMP'
TK_OP           = 'OP'
TK_TOKEN        = 'TOKEN'
TK_COMMENT      = 'COMMENT'
TK_NEWLINE      = 'NEWLINE'
TK_IONUMBER     = 'IO_NUMBER'
TK_ASSIGNMENT   = 'ASSIGNMENT_WORD'
TK_HERENAME     = 'HERENAME'

class Lexer:
    """Main lexer.

    Call add() to feed it data; each delimited token is passed to the
    on_token() callback.
    """
    # Here-document handling makes the whole thing more complex because
    # here-documents basically force tokens to be reordered: here-content must
    # come right after the operator and the here-document name, while some
    # other tokens might be following the here-document expression on the same
    # line.
    #
    # So, here-doc states are basically:
    #   *self._state==ST_NORMAL
    #       - self._heredoc.op is None: no here-document
    #       - self._heredoc.op is not None but name is: here-document operator matched,
    #           waiting for the document name/delimiter
    #       - self._heredoc.op and name are not None: here-document is ready, following
    #           tokens are being stored and will be pushed again when the document is
    #           completely parsed.
    #   *self._state==ST_HEREDOC
    #       - The here-document is being delimited by self._herelexer. Once it is done
    #           the content is pushed in front of the pending token list then all these
    #           tokens are pushed once again.
    ST_NORMAL       = 'ST_NORMAL'
    ST_OP           = 'ST_OP'
    ST_BACKSLASH    = 'ST_BACKSLASH'
    ST_QUOTED       = 'ST_QUOTED'
    ST_COMMENT      = 'ST_COMMENT'
    ST_HEREDOC      = 'ST_HEREDOC'

    #Match end of backquote strings
    RE_BACKQUOTE_END = re.compile(r'(?<!\\)(`)')

    def __init__(self, parent_state = None):
        self._input = []
        self._pos = 0

        self._token = ''
        self._type = TK_TOKEN

        self._state = self.ST_NORMAL
        self._parent_state = parent_state
        self._wordlexer = None

        self._heredoc = HereDoc(None)
        self._herelexer = None

        ### Following attributes are not used for delimiting tokens and can safely
        ### be changed after here-document detection (see _push_token)

        # Count the number of tokens following a 'For' reserved word. Needed to
        # return an 'In' reserved word if it comes in third place.
        self._for_count = None

    def add(self, data, eof=False):
        """Feed the lexer with data.

        When eof is set to True, return the unconsumed data, or raise if the
        lexer is in the middle of a delimiting operation.
        Raise NeedMore otherwise.
        """
        self._input += list(data)
        self._parse(eof)
        self._input[:self._pos] = []
        return ''.join(self._input)

    def _parse(self, eof):
        while self._state:
            if self._pos>=len(self._input):
                if not eof:
                    raise NeedMore()
                elif self._state not in (self.ST_OP, self.ST_QUOTED, self.ST_HEREDOC):
                    #Delimit the current token and leave cleanly
                    self._push_token('')
                    break
                else:
                    #Let the sublexer handle the eof itself
                    pass

            if self._state==self.ST_NORMAL:
                self._parse_normal()
            elif self._state==self.ST_COMMENT:
                self._parse_comment()
            elif self._state==self.ST_OP:
                self._parse_op(eof)
            elif self._state==self.ST_QUOTED:
                self._parse_quoted(eof)
            elif self._state==self.ST_HEREDOC:
                self._parse_heredoc(eof)
            else:
                assert False, "Unknown state " + str(self._state)

        if self._heredoc.op is not None:
            raise ShellSyntaxError('missing here-document delimiter')

    def _parse_normal(self):
        c = self._input[self._pos]
        if c=='\n':
            self._push_token(c)
            self._token = c
            self._type = TK_NEWLINE
            self._push_token('')
            self._pos += 1
        elif c in ('\\', '\'', '"', '`', '$'):
            self._state = self.ST_QUOTED
        elif is_partial_op(c):
            self._push_token(c)

            self._type = TK_OP
            self._token += c
            self._pos += 1
            self._state = self.ST_OP
        elif is_blank(c):
            self._push_token(c)

            #Discard blanks
            self._pos += 1
        elif self._token:
            self._token += c
            self._pos += 1
        elif c=='#':
            self._state = self.ST_COMMENT
            self._type = TK_COMMENT
            self._pos += 1
        else:
            self._pos += 1
            self._token += c

    def _parse_op(self, eof):
        assert self._token

        while 1:
            if self._pos>=len(self._input):
                if not eof:
                    raise NeedMore()
                c = ''
            else:
                c = self._input[self._pos]

            op = self._token + c
            if c and is_partial_op(op):
                #Still parsing an operator
                self._token = op
                self._pos += 1
            else:
                #End of operator
                self._push_token(c)
                self._state = self.ST_NORMAL
                break

    def _parse_comment(self):
        while 1:
            if self._pos>=len(self._input):
                raise NeedMore()

            c = self._input[self._pos]
            if c=='\n':
                #End of comment, do not consume the end of line
                self._state = self.ST_NORMAL
                break
            else:
                self._token += c
                self._pos += 1

    def _parse_quoted(self, eof):
        """Precondition: the starting backquote/dollar is still in the input queue."""
        if not self._wordlexer:
            self._wordlexer = WordLexer()

        data = []
        if self._pos<len(self._input):
            #Transfer input queue characters into the subparser
            data = self._input[self._pos:]
            self._pos += len(data)

        wtree, remaining = self._wordlexer.add(data, eof)
        self._wordlexer = None
        self._token += wordtree_as_string(wtree)

        #Put unparsed characters back in the input queue
        if remaining:
            self._input[self._pos:self._pos] = list(remaining)
        self._state = self.ST_NORMAL

    def _parse_heredoc(self, eof):
        assert not self._token

        if self._herelexer is None:
            self._herelexer = HereDocLexer(self._heredoc.op, self._heredoc.name)

        data = []
        if self._pos<len(self._input):
            #Transfer input queue characters into the subparser
            data = self._input[self._pos:]
            self._pos += len(data)

        self._token, remaining = self._herelexer.add(data, eof)

        #Reset here-document state
        self._herelexer = None
        heredoc, self._heredoc = self._heredoc, HereDoc(None)
        if remaining:
            self._input[self._pos:self._pos] = list(remaining)
        self._state = self.ST_NORMAL

        #Push pending tokens
        heredoc.pendings[:0] = [(self._token, self._type, heredoc.name)]
        for token, type, delim in heredoc.pendings:
            self._token = token
            self._type = type
            self._push_token(delim)

    def _push_token(self, delim):
        if not self._token:
            return 0

        if self._heredoc.op is not None:
            if self._heredoc.name is None:
                #Here-document name
                if self._type!=TK_TOKEN:
                    raise ShellSyntaxError("expecting here-document name, got '%s'" % self._token)
                self._heredoc.name = unquote_wordtree(make_wordtree(self._token))
                self._type = TK_HERENAME
            else:
                #Capture all tokens until the newline starting the here-document
                if self._type==TK_NEWLINE:
                    assert self._state==self.ST_NORMAL
                    self._state = self.ST_HEREDOC

                self._heredoc.pendings.append((self._token, self._type, delim))
                self._token = ''
                self._type = TK_TOKEN
                return 1

        # BEWARE: do not change parser state from here to the end of the function:
        # when parsing between a here-document operator and the end of the line,
        # tokens are stored in self._heredoc.pendings. Therefore, they will not
        # reach the section below.

        #Check operators
        if self._type==TK_OP:
            #False positive because of partial op matching
            op = is_op(self._token)
            if not op:
                self._type = TK_TOKEN
            else:
                #Map to the specific operator
                self._type = op
                if self._token in ('<<', '<<-'):
                    #Done here rather than in _parse_op because there is no need
                    #to change the parser state since we are still waiting for
                    #the here-document name
                    if self._heredoc.op is not None:
                        raise ShellSyntaxError("syntax error near token '%s'" % self._token)
                    assert self._heredoc.op is None
                    self._heredoc.op = self._token

        if self._type==TK_TOKEN:
            if '=' in self._token and not delim:
                if self._token.startswith('='):
                    #Token is a WORD... a TOKEN that is.
                    pass
                else:
                    prev = self._token[:self._token.find('=')]
                    if is_name(prev):
                        self._type = TK_ASSIGNMENT
                    else:
                        #Just a token (unspecified)
                        pass
            else:
                reserved = get_reserved(self._token)
                if reserved is not None:
                    if reserved=='In' and self._for_count!=2:
                        #Sorry, not a reserved word after all
                        pass
                    else:
                        self._type = reserved
                        if reserved in ('For', 'Case'):
                            self._for_count = 0
                elif are_digits(self._token) and delim in ('<', '>'):
                    #Detect IO_NUMBER
                    self._type = TK_IONUMBER
                elif self._token==';':
                    self._type = TK_COMMA
                elif self._token=='&':
                    self._type = TK_AMPERSAND
        elif self._type==TK_COMMENT:
            #Comments are not part of sh grammar, ignore them
            self._token = ''
            self._type = TK_TOKEN
            return 0

        if self._for_count is not None:
            #Track token count in 'For' expression to detect 'In' reserved words.
            #Can only be in third position, no need to go beyond
            self._for_count += 1
            if self._for_count==3:
                self._for_count = None

        self.on_token((self._token, self._type))
        self._token = ''
        self._type = TK_TOKEN
        return 1

    def on_token(self, token):
        raise NotImplementedError


tokens = [
    TK_TOKEN,
# To silence yacc unused token warnings
#    TK_COMMENT,
    TK_NEWLINE,
    TK_IONUMBER,
    TK_ASSIGNMENT,
    TK_HERENAME,
]

#Add specific operators
tokens += _OPERATORS.values()
#Add reserved words
tokens += _RESERVEDS.values()

class PLYLexer(Lexer):
    """Bridge Lexer and PLY lexer interface."""
    def __init__(self):
        Lexer.__init__(self)
        self._tokens = []
        self._current = 0
        self.lineno = 0

    def on_token(self, token):
        value, type = token

        self.lineno = 0
        t = lex.LexToken()
        t.value = value
        t.type = type
        t.lexer = self
        t.lexpos = 0
        t.lineno = 0

        self._tokens.append(t)

    def is_empty(self):
        return not bool(self._tokens)

    #PLY compliant interface
    def token(self):
        if self._current>=len(self._tokens):
            return None
        t = self._tokens[self._current]
        self._current += 1
        return t


def get_tokens(s):
    """Parse the input string and return a tuple (tokens, unprocessed) where
    tokens is a list of parsed tokens and unprocessed is the part of the input
    string left untouched by the lexer.
    """
    lexer = PLYLexer()
    untouched = lexer.add(s, True)
    tokens = []
    while 1:
        token = lexer.token()
        if token is None:
            break
        tokens.append(token)

    tokens = [(t.value, t.type) for t in tokens]
    return tokens, untouched
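
# A minimal, hedged smoke test (not part of the original module; assumes ply
# and bb.pysh.sherrors are importable in this environment):
if __name__ == '__main__':
    # By a hand trace, a trivial command line should yield plain TOKENs
    # followed by a NEWLINE, with nothing left unprocessed:
    #   [('echo', 'TOKEN'), ('hello', 'TOKEN'), ('\n', 'NEWLINE')], ''
    toks, left = get_tokens('echo hello\n')
    print(toks, repr(left))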