# pyshlex.py - PLY compatible lexer for pysh.
#
# Copyright 2007 Patrick Mezard
#
# This software may be used and distributed according to the terms
# of the GNU General Public License, incorporated herein by reference.

# TODO:
# - review all "char in 'abc'" snippets: the empty string can be matched
# - test line continuations within quoted/expansion strings
# - eof is buggy wrt sublexers
# - the lexer cannot really work in pull mode, as that would require running
#   PLY in pull mode. It was designed to work incrementally and it would not
#   be that hard to enable pull mode.
import re

from ply import lex
from bb.pysh.sherrors import *

class NeedMore(Exception):
    pass

def is_blank(c):
    return c in (' ', '\t')

_RE_DIGITS = re.compile(r'^\d+$')

def are_digits(s):
    return _RE_DIGITS.search(s) is not None

_OPERATORS = dict([
    ('&&', 'AND_IF'),
    ('||', 'OR_IF'),
    (';;', 'DSEMI'),
    ('<<', 'DLESS'),
    ('>>', 'DGREAT'),
    ('<&', 'LESSAND'),
    ('>&', 'GREATAND'),
    ('<>', 'LESSGREAT'),
    ('<<-', 'DLESSDASH'),
    ('>|', 'CLOBBER'),
    ('&', 'AMP'),
    (';', 'COMMA'),
    ('<', 'LESS'),
    ('>', 'GREATER'),
    ('(', 'LPARENS'),
    (')', 'RPARENS'),
])

#Make a function to silence pychecker "Local variable shadows global"
def make_partial_ops():
    partials = {}
    for k in _OPERATORS:
        for i in range(1, len(k)+1):
            partials[k[:i]] = None
    return partials

_PARTIAL_OPERATORS = make_partial_ops()

def is_partial_op(s):
    """Return True if s matches a non-empty prefix of an operator, starting
    at its first character.
    """
    return s in _PARTIAL_OPERATORS

def is_op(s):
    """If s matches an operator, return the operator identifier. Return None
    otherwise.
    """
    return _OPERATORS.get(s)

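# Illustrative examples of the two helpers above (a sketch, not executed):
#   is_partial_op('<')    -> True  ('<' starts '<<', '<&', '<>', '<<-', ...)
#   is_partial_op('<<-')  -> True
#   is_partial_op('<a')   -> False
#   is_op('<<')           -> 'DLESS'
#   is_op('<<<')          -> None (not a POSIX shell operator)
# The main lexer grows an operator token one character at a time with
# is_partial_op() and maps the final string with is_op().
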
_RESERVEDS = dict([
    ('if', 'If'),
    ('then', 'Then'),
    ('else', 'Else'),
    ('elif', 'Elif'),
    ('fi', 'Fi'),
    ('do', 'Do'),
    ('done', 'Done'),
    ('case', 'Case'),
    ('esac', 'Esac'),
    ('while', 'While'),
    ('until', 'Until'),
    ('for', 'For'),
    ('{', 'Lbrace'),
    ('}', 'Rbrace'),
    ('!', 'Bang'),
    ('in', 'In'),
    ('|', 'PIPE'),
])

def get_reserved(s):
    return _RESERVEDS.get(s)

_RE_NAME = re.compile(r'^[0-9a-zA-Z_]+$')

def is_name(s):
    return _RE_NAME.search(s) is not None

def find_chars(seq, chars):
    for i,v in enumerate(seq):
        if v in chars:
            return i,v
    return -1, None

class WordLexer:
    """WordLexer parses quoted or expansion expressions and returns an
    expression tree. The input string can be any well-formed sequence
    beginning with a quoting or expansion character. Embedded expressions are
    handled recursively. The resulting tree is made of lists and strings.
    Lists represent quoted or expansion expressions: each list's first
    element is the opening separator, the last one the closing separator.
    In between can be any number of strings, or lists for sub-expressions.
    Non-quoted/expansion expressions can be written as plain strings or as
    lists with empty strings as starting and ending delimiters.
    """

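    # A worked example (illustrative): feeding '"a $(b) c"' with eof=True
    # yields the tree
    #   ['"', 'a ', ['$(', 'b', ')'], ' c', '"']
    # with no remaining data; the embedded command substitution becomes a
    # nested list carrying its own opening and closing separators.
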
    NAME_CHARSET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_'
    NAME_CHARSET = dict(zip(NAME_CHARSET, NAME_CHARSET))

    SPECIAL_CHARSET = '@*#?-$!0'

    #The set of characters which can be escaped depends on the current delimiter
    ESCAPABLE = {
        '`': set(['$', '\\', '`']),
        '"': set(['$', '\\', '`', '"']),
        "'": set(),
    }

    def __init__(self, heredoc = False):
        # _buffer is the unprocessed input characters buffer
        self._buffer = []
        # _stack is empty or contains a quoted list being processed
        # (this is the DFS path to the quoted expression being evaluated).
        self._stack = []
        self._escapable = None
        # True when parsing unquoted here-documents
        self._heredoc = heredoc

    def add(self, data, eof=False):
        """Feed the lexer with more data. If the quoted expression can be
        delimited, return a tuple (expr, remaining) containing the expression
        tree and the unconsumed data. Otherwise, raise NeedMore.
        """
        self._buffer += list(data)
        self._parse(eof)

        result = self._stack[0]
        remaining = ''.join(self._buffer)
        self._stack = []
        self._buffer = []
        return result, remaining

    def _is_escapable(self, c, delim=None):
        if delim is None:
            if self._heredoc:
                # Backslashes work as if they were double quoted in unquoted
                # here-documents
                delim = '"'
            else:
                if len(self._stack)<=1:
                    return True
                delim = self._stack[-2][0]

        escapables = self.ESCAPABLE.get(delim, None)
        return escapables is None or c in escapables

    def _parse_squote(self, buf, result, eof):
        if not buf:
            raise NeedMore()
        try:
            pos = buf.index("'")
        except ValueError:
            raise NeedMore()
        result[-1] += ''.join(buf[:pos])
        result += ["'"]
        return pos+1, True

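    # Example (illustrative): WordLexer().add("'a$b' tail", eof=True) returns
    # (["'", 'a$b', "'"], ' tail'); nothing is special inside single quotes,
    # the first unescaped "'" simply closes the expression.
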
    def _parse_bquote(self, buf, result, eof):
        if not buf:
            raise NeedMore()

        if buf[0]=='\n':
            #Remove line continuations
            result[:] = ['', '', '']
        elif self._is_escapable(buf[0]):
            result[-1] += buf[0]
            result += ['']
        else:
            #Keep as such
            result[:] = ['', '\\'+buf[0], '']

        return 1, True

    def _parse_dquote(self, buf, result, eof):
        if not buf:
            raise NeedMore()
        pos, sep = find_chars(buf, '$\\`"')
        if pos==-1:
            raise NeedMore()

        result[-1] += ''.join(buf[:pos])
        if sep=='"':
            result += ['"']
            return pos+1, True
        else:
            #Keep everything until the separator and defer processing
            return pos, False

    def _parse_command(self, buf, result, eof):
        if not buf:
            raise NeedMore()

        chars = '$\\`"\''
        if result[0] == '$(':
            chars += ')'
        pos, sep = find_chars(buf, chars)
        if pos == -1:
            raise NeedMore()

        result[-1] += ''.join(buf[:pos])
        if (result[0]=='$(' and sep==')') or (result[0]=='`' and sep=='`'):
            result += [sep]
            return pos+1, True
        else:
            return pos, False

    def _parse_parameter(self, buf, result, eof):
        if not buf:
            raise NeedMore()

        pos, sep = find_chars(buf, '$\\`"\'}')
        if pos==-1:
            raise NeedMore()

        result[-1] += ''.join(buf[:pos])
        if sep=='}':
            result += [sep]
            return pos+1, True
        else:
            return pos, False

    def _parse_dollar(self, buf, result, eof):
        sep = result[0]
        if sep=='$':
            if not buf:
                #TODO: handle empty $
                raise NeedMore()
            if buf[0]=='(':
                if len(buf)==1:
                    raise NeedMore()

                if buf[1]=='(':
                    result[0] = '$(('
                    buf[:2] = []
                else:
                    result[0] = '$('
                    buf[:1] = []

            elif buf[0]=='{':
                result[0] = '${'
                buf[:1] = []
            else:
                if buf[0] in self.SPECIAL_CHARSET:
                    result[-1] = buf[0]
                    read = 1
                else:
                    for read,c in enumerate(buf):
                        if c not in self.NAME_CHARSET:
                            break
                    else:
                        if not eof:
                            raise NeedMore()
                        read += 1

                result[-1] += ''.join(buf[0:read])

                if not result[-1]:
                    result[:] = ['', result[0], '']
                else:
                    result += ['']
                return read,True

        sep = result[0]
        if sep=='$(':
            parsefunc = self._parse_command
        elif sep=='${':
            parsefunc = self._parse_parameter
        else:
            raise NotImplementedError(sep)

        pos, closed = parsefunc(buf, result, eof)
        return pos, closed

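    # Prefix dispatch summary for _parse_dollar (restating the code above):
    #   '$NAME'    - a name read greedily from NAME_CHARSET
    #   '$@' etc.  - a single special parameter from SPECIAL_CHARSET
    #   '${...}'   - delegated to _parse_parameter
    #   '$(...)'   - delegated to _parse_command
    #   '$((...))' - recognized, but arithmetic expansion is not implemented
    #                and raises NotImplementedError in the dispatch above
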
    def _parse(self, eof):
        buf = self._buffer
        stack = self._stack
        recurse = False

        while 1:
            if not stack or recurse:
                if not buf:
                    raise NeedMore()
                if buf[0] not in ('"\\`$\''):
                    raise ShellSyntaxError('Invalid quoted string sequence')
                stack.append([buf[0], ''])
                buf[:1] = []
                recurse = False

            result = stack[-1]
            if result[0]=="'":
                parsefunc = self._parse_squote
            elif result[0]=='\\':
                parsefunc = self._parse_bquote
            elif result[0]=='"':
                parsefunc = self._parse_dquote
            elif result[0]=='`':
                parsefunc = self._parse_command
            elif result[0][0]=='$':
                parsefunc = self._parse_dollar
            else:
                raise NotImplementedError()

            read, closed = parsefunc(buf, result, eof)

            buf[:read] = []
            if closed:
                if len(stack)>1:
                    #Merge in parent expression
                    parsed = stack.pop()
                    stack[-1] += [parsed]
                    stack[-1] += ['']
                else:
                    break
            else:
                recurse = True

def normalize_wordtree(wtree):
    """Fold back every literal sequence (delimited with empty strings) into
    the parent sequence.
    """
    def normalize(wtree):
        result = []
        for part in wtree[1:-1]:
            if isinstance(part, list):
                part = normalize(part)
                if part[0]=='':
                    #Move the part content back at current level
                    result += part[1:-1]
                    continue
            elif not part:
                #Remove empty strings
                continue
            result.append(part)
        if not result:
            result = ['']
        return [wtree[0]] + result + [wtree[-1]]

    return normalize(wtree)

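# Example (illustrative):
#   normalize_wordtree(['', 'a', ['', 'b', ''], 'c', ''])
# returns ['', 'a', 'b', 'c', '']: the inner literal sequence is folded back
# into its parent and empty strings are dropped.
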
def make_wordtree(token, here_document=False):
    """Parse a delimited token and return a tree similar to the ones returned
    by WordLexer. token may contain any combination of expansion/quoted
    fields and non-ones.
    """
    tree = ['']
    remaining = token
    delimiters = '\\$`'
    if not here_document:
        delimiters += '\'"'

    while 1:
        pos, sep = find_chars(remaining, delimiters)
        if pos==-1:
            tree += [remaining, '']
            return normalize_wordtree(tree)
        tree.append(remaining[:pos])
        remaining = remaining[pos:]

        try:
            result, remaining = WordLexer(heredoc = here_document).add(remaining, True)
        except NeedMore:
            raise ShellSyntaxError('Invalid token "%s"' % token)
        tree.append(result)


def wordtree_as_string(wtree):
    """Rewrite an expression tree generated by make_wordtree as a string."""
    def visit(node, output):
        for child in node:
            if isinstance(child, list):
                visit(child, output)
            else:
                output.append(child)

    output = []
    visit(wtree, output)
    return ''.join(output)


def unquote_wordtree(wtree):
    """Fold the word tree while removing quotes everywhere. Other expansion
    sequences are joined as such.
    """
    def unquote(wtree):
        unquoted = []
        if wtree[0] in ('', "'", '"', '\\'):
            wtree = wtree[1:-1]

        for part in wtree:
            if isinstance(part, list):
                part = unquote(part)
            unquoted.append(part)
        return ''.join(unquoted)

    return unquote(wtree)

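# Example (illustrative) of how the helpers above compose:
#   unquote_wordtree(make_wordtree('"foo"\'bar\''))
# returns 'foobar'. make_wordtree() builds
#   ['', ['"', 'foo', '"'], ["'", 'bar', "'"], '']
# and unquote_wordtree() strips the quoting delimiters while joining the
# fragments.
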
class HereDocLexer:
    """HereDocLexer delimits the content of a here-document, from the newline
    following the operator (not included) to the closing delimiter line
    (included).
    """
    def __init__(self, op, delim):
        assert op in ('<<', '<<-')
        if not delim:
            raise ShellSyntaxError('invalid here document delimiter %s' % str(delim))

        self._op = op
        self._delim = delim
        self._buffer = []
        self._token = []

    def add(self, data, eof):
        """If the here-document was delimited, return a tuple
        (content, remaining). Raise NeedMore() otherwise.
        """
        self._buffer += list(data)
        self._parse(eof)
        token = ''.join(self._token)
        remaining = ''.join(self._buffer)
        self._token, self._buffer = [], []
        return token, remaining

    def _parse(self, eof):
        while 1:
            #Look for first unescaped newline. Quotes may be ignored
            escaped = False
            for i,c in enumerate(self._buffer):
                if escaped:
                    escaped = False
                elif c=='\\':
                    escaped = True
                elif c=='\n':
                    break
            else:
                i = -1

            if i==-1 or self._buffer[i]!='\n':
                if not eof:
                    raise NeedMore()
                #No more data, maybe the last line is the closing delimiter
                line = ''.join(self._buffer)
                eol = ''
                self._buffer[:] = []
            else:
                line = ''.join(self._buffer[:i])
                eol = self._buffer[i]
                self._buffer[:i+1] = []

            if self._op=='<<-':
                line = line.lstrip('\t')

            if line==self._delim:
                break

            self._token += [line, eol]
            if i==-1:
                break

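# Example (illustrative):
#   HereDocLexer('<<', 'EOF').add('hello\nEOF\ntrailing', eof=True)
# returns ('hello\n', 'trailing'): the content stops at the 'EOF' line, which
# is consumed but not part of the content. With '<<-', leading tabs would be
# stripped before comparing each line to the delimiter.
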
class Token:
    #TODO: check this is still in use
    OPERATOR = 'OPERATOR'
    WORD = 'WORD'

    def __init__(self):
        self.value = ''
        self.type = None

    def __getitem__(self, key):
        #Behave like a two-element tuple
        if key==0:
            return self.type
        if key==1:
            return self.value
        raise IndexError(key)


class HereDoc:
    def __init__(self, op, name=None):
        self.op = op
        self.name = name
        self.pendings = []

TK_COMMA = 'COMMA'
TK_AMPERSAND = 'AMP'
TK_OP = 'OP'
TK_TOKEN = 'TOKEN'
TK_COMMENT = 'COMMENT'
TK_NEWLINE = 'NEWLINE'
TK_IONUMBER = 'IO_NUMBER'
TK_ASSIGNMENT = 'ASSIGNMENT_WORD'
TK_HERENAME = 'HERENAME'

class Lexer:
    """Main lexer.

    Call add() until the script AST is returned.
    """
    # Here-document handling makes the whole thing more complex because it
    # basically forces tokens to be reordered: here-content must come right
    # after the operator and the here-document name, while some other tokens
    # might be following the here-document expression on the same line.
    #
    # So, here-doc states are basically:
    # *self._state==ST_NORMAL
    #   - self._heredoc.op is None: no here-document
    #   - self._heredoc.op is not None but name is: here-document operator
    #     matched, waiting for the document name/delimiter
    #   - self._heredoc.op and name are not None: here-document is ready,
    #     following tokens are being stored and will be pushed again when the
    #     document is completely parsed.
    # *self._state==ST_HEREDOC
    #   - The here-document is being delimited by self._herelexer. Once it is
    #     done the content is pushed in front of the pending token list, then
    #     all these tokens are pushed once again.

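    # Illustrative walk-through: for "cat <<EOF && foo\n...", 'cat' and '<<'
    # are emitted immediately; 'EOF' becomes the here-document name and is
    # buffered in self._heredoc.pendings together with '&&', 'foo' and the
    # newline; once the here-document body is delimited, the body is inserted
    # in front of those pending tokens and everything is re-pushed through
    # _push_token().
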
    ST_NORMAL = 'ST_NORMAL'
    ST_OP = 'ST_OP'
    ST_BACKSLASH = 'ST_BACKSLASH'
    ST_QUOTED = 'ST_QUOTED'
    ST_COMMENT = 'ST_COMMENT'
    ST_HEREDOC = 'ST_HEREDOC'

    #Match end of backquote strings
    RE_BACKQUOTE_END = re.compile(r'(?<!\\)(`)')

    def __init__(self, parent_state = None):
        self._input = []
        self._pos = 0

        self._token = ''
        self._type = TK_TOKEN

        self._state = self.ST_NORMAL
        self._parent_state = parent_state
        self._wordlexer = None

        self._heredoc = HereDoc(None)
        self._herelexer = None

        ### The following attributes are not used for delimiting tokens and
        ### can safely be changed after here-document detection (see
        ### _push_token)

        # Count the number of tokens following a 'For' reserved word. Needed
        # to return an 'In' reserved word if it comes in third place.
        self._for_count = None

    def add(self, data, eof=False):
        """Feed the lexer with data.

        When eof is set to True, return the unconsumed data or raise if the
        lexer is in the middle of a delimiting operation.
        Raise NeedMore otherwise.
        """
        self._input += list(data)
        self._parse(eof)
        self._input[:self._pos] = []
        return ''.join(self._input)

    def _parse(self, eof):
        while self._state:
            if self._pos>=len(self._input):
                if not eof:
                    raise NeedMore()
                elif self._state not in (self.ST_OP, self.ST_QUOTED, self.ST_HEREDOC):
                    #Delimit the current token and leave cleanly
                    self._push_token('')
                    break
                else:
                    #Let the sublexers handle the eof themselves
                    pass

            if self._state==self.ST_NORMAL:
                self._parse_normal()
            elif self._state==self.ST_COMMENT:
                self._parse_comment()
            elif self._state==self.ST_OP:
                self._parse_op(eof)
            elif self._state==self.ST_QUOTED:
                self._parse_quoted(eof)
            elif self._state==self.ST_HEREDOC:
                self._parse_heredoc(eof)
            else:
                assert False, "Unknown state " + str(self._state)

        if self._heredoc.op is not None:
            raise ShellSyntaxError('missing here-document delimiter')

    def _parse_normal(self):
        c = self._input[self._pos]
        if c=='\n':
            self._push_token(c)
            self._token = c
            self._type = TK_NEWLINE
            self._push_token('')
            self._pos += 1
        elif c in ('\\', '\'', '"', '`', '$'):
            self._state = self.ST_QUOTED
        elif is_partial_op(c):
            self._push_token(c)

            self._type = TK_OP
            self._token += c
            self._pos += 1
            self._state = self.ST_OP
        elif is_blank(c):
            self._push_token(c)

            #Discard blanks
            self._pos += 1
        elif self._token:
            self._token += c
            self._pos += 1
        elif c=='#':
            self._state = self.ST_COMMENT
            self._type = TK_COMMENT
            self._pos += 1
        else:
            self._pos += 1
            self._token += c

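    # Example (illustrative): operators are matched greedily, so an input of
    # '>>|' is delimited as '>>' (DGREAT) followed by '|'. Note that '|' is
    # not in _OPERATORS; _push_token() later maps it to the 'PIPE' reserved
    # word.
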
    def _parse_op(self, eof):
        assert self._token

        while 1:
            if self._pos>=len(self._input):
                if not eof:
                    raise NeedMore()
                c = ''
            else:
                c = self._input[self._pos]

            op = self._token + c
            if c and is_partial_op(op):
                #Still parsing an operator
                self._token = op
                self._pos += 1
            else:
                #End of operator
                self._push_token(c)
                self._state = self.ST_NORMAL
                break

    def _parse_comment(self):
        while 1:
            if self._pos>=len(self._input):
                raise NeedMore()

            c = self._input[self._pos]
            if c=='\n':
                #End of comment, do not consume the end of line
                self._state = self.ST_NORMAL
                break
            else:
                self._token += c
                self._pos += 1

    def _parse_quoted(self, eof):
        """Precondition: the starting backquote/dollar is still in the input queue."""
        if not self._wordlexer:
            self._wordlexer = WordLexer()

        #The transferred data may be empty at eof
        data = []
        if self._pos<len(self._input):
            #Transfer the input queue characters into the subparser
            data = self._input[self._pos:]
            self._pos += len(data)

        wtree, remaining = self._wordlexer.add(data, eof)
        self._wordlexer = None
        self._token += wordtree_as_string(wtree)

        #Put unparsed characters back in the input queue
        if remaining:
            self._input[self._pos:self._pos] = list(remaining)
        self._state = self.ST_NORMAL

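    # Example (illustrative): given '"foo" bar' in the input queue,
    # _parse_quoted hands everything to a WordLexer, which consumes '"foo"'
    # and leaves ' bar' unconsumed; the remainder is pushed back into
    # self._input before returning to ST_NORMAL.
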
    def _parse_heredoc(self, eof):
        assert not self._token

        if self._herelexer is None:
            self._herelexer = HereDocLexer(self._heredoc.op, self._heredoc.name)

        #The transferred data may be empty at eof
        data = []
        if self._pos<len(self._input):
            #Transfer the input queue characters into the subparser
            data = self._input[self._pos:]
            self._pos += len(data)

        self._token, remaining = self._herelexer.add(data, eof)

        #Reset the here-document state
        self._herelexer = None
        heredoc, self._heredoc = self._heredoc, HereDoc(None)
        if remaining:
            self._input[self._pos:self._pos] = list(remaining)
        self._state = self.ST_NORMAL

        #Push pending tokens
        heredoc.pendings[:0] = [(self._token, self._type, heredoc.name)]
        for token, type, delim in heredoc.pendings:
            self._token = token
            self._type = type
            self._push_token(delim)

    def _push_token(self, delim):
        if not self._token:
            return 0

        if self._heredoc.op is not None:
            if self._heredoc.name is None:
                #Here-document name
                if self._type!=TK_TOKEN:
                    raise ShellSyntaxError("expecting here-document name, got '%s'" % self._token)
                self._heredoc.name = unquote_wordtree(make_wordtree(self._token))
                self._type = TK_HERENAME
            else:
                #Capture all tokens until the newline starting the here-document
                if self._type==TK_NEWLINE:
                    assert self._state==self.ST_NORMAL
                    self._state = self.ST_HEREDOC

            self._heredoc.pendings.append((self._token, self._type, delim))
            self._token = ''
            self._type = TK_TOKEN
            return 1

        # BEWARE: do not change parser state from here to the end of the
        # function: when parsing between a here-document operator and the end
        # of the line, tokens are stored in self._heredoc.pendings. Therefore,
        # they will not reach the section below.

        #Check operators
        if self._type==TK_OP:
            #False positive because of partial op matching
            op = is_op(self._token)
            if not op:
                self._type = TK_TOKEN
            else:
                #Map to the specific operator
                self._type = op
                if self._token in ('<<', '<<-'):
                    #Done here rather than in _parse_op because there is no
                    #need to change the parser state since we are still
                    #waiting for the here-document name
                    if self._heredoc.op is not None:
                        raise ShellSyntaxError("syntax error near token '%s'" % self._token)
                    assert self._heredoc.op is None
                    self._heredoc.op = self._token

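        # Classification examples for the branch below (illustrative):
        #   'FOO=bar' with an empty delimiter -> ASSIGNMENT_WORD, since 'FOO'
        #       passes is_name(); '=bar' stays a plain TOKEN
        #   '2' delimited by '>' -> IO_NUMBER, as in '2>file'
        #   'in' -> the 'In' reserved word only as the third token of a 'for'
        #       clause (tracked through self._for_count)
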
        if self._type==TK_TOKEN:
            if '=' in self._token and not delim:
                if self._token.startswith('='):
                    #Token is a WORD... a TOKEN that is.
                    pass
                else:
                    prev = self._token[:self._token.find('=')]
                    if is_name(prev):
                        self._type = TK_ASSIGNMENT
                    else:
                        #Just a token (unspecified)
                        pass
            else:
                reserved = get_reserved(self._token)
                if reserved is not None:
                    if reserved=='In' and self._for_count!=2:
                        #Sorry, not a reserved word after all
                        pass
                    else:
                        self._type = reserved
                        if reserved in ('For', 'Case'):
                            self._for_count = 0
                elif are_digits(self._token) and delim in ('<', '>'):
                    #Detect IO_NUMBER
                    self._type = TK_IONUMBER
                elif self._token==';':
                    self._type = TK_COMMA
                elif self._token=='&':
                    self._type = TK_AMPERSAND
        elif self._type==TK_COMMENT:
            #Comments are not part of the sh grammar, ignore them
            self._token = ''
            self._type = TK_TOKEN
            return 0

        if self._for_count is not None:
            #Track token count in 'For' expression to detect 'In' reserved
            #words. Can only be in third position, no need to go beyond
            self._for_count += 1
            if self._for_count==3:
                self._for_count = None

        self.on_token((self._token, self._type))
        self._token = ''
        self._type = TK_TOKEN
        return 1

    def on_token(self, token):
        raise NotImplementedError


tokens = [
    TK_TOKEN,
# To silence yacc unused token warnings
#    TK_COMMENT,
    TK_NEWLINE,
    TK_IONUMBER,
    TK_ASSIGNMENT,
    TK_HERENAME,
]

#Add specific operators
tokens += _OPERATORS.values()
#Add reserved words
tokens += _RESERVEDS.values()

class PLYLexer(Lexer):
    """Bridge Lexer and PLY lexer interface."""
    def __init__(self):
        Lexer.__init__(self)
        self._tokens = []
        self._current = 0
        self.lineno = 0

    def on_token(self, token):
        value, type = token

        self.lineno = 0
        t = lex.LexToken()
        t.value = value
        t.type = type
        t.lexer = self
        t.lexpos = 0
        t.lineno = 0

        self._tokens.append(t)

    def is_empty(self):
        return not bool(self._tokens)

    #PLY compliant interface
    def token(self):
        if self._current>=len(self._tokens):
            return None
        t = self._tokens[self._current]
        self._current += 1
        return t


def get_tokens(s):
    """Parse the input string and return a tuple (tokens, unprocessed) where
    tokens is a list of parsed tokens and unprocessed is the part of the
    input string left untouched by the lexer.
    """
    lexer = PLYLexer()
    untouched = lexer.add(s, True)
    tokens = []
    while 1:
        token = lexer.token()
        if token is None:
            break
        tokens.append(token)

    tokens = [(t.value, t.type) for t in tokens]
    return tokens, untouched
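
if __name__ == '__main__':
    # Minimal smoke test (a sketch, not part of the original module; assumes
    # PLY and bb.pysh.sherrors are importable). Expected output, roughly:
    #   [('echo', 'TOKEN'), ('foo', 'TOKEN'), ('&&', 'AND_IF'),
    #    ('bar', 'TOKEN'), ('\n', 'NEWLINE')] ''
    toks, remaining = get_tokens('echo foo && bar\n')
    print(toks, repr(remaining))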