# -----------------------------------------------------------------------------
# ply: lex.py
#
# Copyright (C) 2001-2009,
# David M. Beazley (Dabeaz LLC)
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
# * Neither the name of the David Beazley or Dabeaz LLC may be used to
#   endorse or promote products derived from this software without
#   specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# -----------------------------------------------------------------------------

__version__ = "3.3"
__tabversion__ = "3.2"       # Version of table file used

import re, sys, types, copy, os

# This tuple contains known string types
try:
    # Python 2.6
    StringTypes = (types.StringType, types.UnicodeType)
except AttributeError:
    # Python 3.0
    StringTypes = (str, bytes)

# Extract the code attribute of a function. Different implementations
# are for Python 2/3 compatibility.

if sys.version_info[0] < 3:
    def func_code(f):
        return f.func_code
else:
    def func_code(f):
        return f.__code__

# This regular expression is used to match valid token names
_is_identifier = re.compile(r'^[a-zA-Z0-9_]+$')

# Exception thrown when invalid token encountered and no default error
# handler is defined.

class LexError(Exception):
    def __init__(self,message,s):
        self.args = (message,)
        self.text = s

# Token class.  This class is used to represent the tokens produced.
class LexToken(object):
    def __str__(self):
        return "LexToken(%s,%r,%d,%d)" % (self.type,self.value,self.lineno,self.lexpos)
    def __repr__(self):
        return str(self)

# This object is a stand-in for a logging object created by the
# logging module.

class PlyLogger(object):
    def __init__(self,f):
        self.f = f
    def critical(self,msg,*args,**kwargs):
        self.f.write((msg % args) + "\n")

    def warning(self,msg,*args,**kwargs):
        self.f.write("WARNING: " + (msg % args) + "\n")

    def error(self,msg,*args,**kwargs):
        self.f.write("ERROR: " + (msg % args) + "\n")

    info = critical
    debug = critical

# Null logger is used when no output is generated.  Does nothing.
class NullLogger(object):
    def __getattribute__(self,name):
        return self
    def __call__(self,*args,**kwargs):
        return self

# -----------------------------------------------------------------------------
# === Lexing Engine ===
#
# The following Lexer class implements the lexer runtime.  There are only
# a few public methods and attributes:
#
#    input()   -  Store a new string in the lexer
#    token()   -  Get the next token
#    clone()   -  Clone the lexer
#
#    lineno    -  Current line number
#    lexpos    -  Current position in the input string
# -----------------------------------------------------------------------------
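
# A minimal usage sketch (assuming a lexer already built by the lex() function
# defined later in this file):
#
#     lexer.input("x = 3 + 4")
#     for tok in lexer:              # Lexer objects support iteration
#         print(tok.type, tok.value)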

class Lexer:
    def __init__(self):
        self.lexre = None             # Master regular expression. This is a list of
                                      # tuples (re,findex) where re is a compiled
                                      # regular expression and findex is a list
                                      # mapping regex group numbers to rules
        self.lexretext = None         # Current regular expression strings
        self.lexstatere = {}          # Dictionary mapping lexer states to master regexes
        self.lexstateretext = {}      # Dictionary mapping lexer states to regex strings
        self.lexstaterenames = {}     # Dictionary mapping lexer states to symbol names
        self.lexstate = "INITIAL"     # Current lexer state
        self.lexstatestack = []       # Stack of lexer states
        self.lexstateinfo = None      # State information
        self.lexstateignore = {}      # Dictionary of ignored characters for each state
        self.lexstateerrorf = {}      # Dictionary of error functions for each state
        self.lexreflags = 0           # Optional re compile flags
        self.lexdata = None           # Actual input data (as a string)
        self.lexpos = 0               # Current position in input text
        self.lexlen = 0               # Length of the input text
        self.lexerrorf = None         # Error rule (if any)
        self.lextokens = None         # List of valid tokens
        self.lexignore = ""           # Ignored characters
        self.lexliterals = ""         # Literal characters that can be passed through
        self.lexmodule = None         # Module
        self.lineno = 1               # Current line number
        self.lexoptimize = 0          # Optimized mode

    def clone(self,object=None):
        c = copy.copy(self)

        # If the object parameter has been supplied, it means we are attaching the
        # lexer to a new object.  In this case, we have to rebind all methods in
        # the lexstatere and lexstateerrorf tables.

        if object:
            newtab = { }
            for key, ritem in self.lexstatere.items():
                newre = []
                for cre, findex in ritem:
                    newfindex = []
                    for f in findex:
                        if not f or not f[0]:
                            newfindex.append(f)
                            continue
                        newfindex.append((getattr(object,f[0].__name__),f[1]))
                    newre.append((cre,newfindex))
                newtab[key] = newre
            c.lexstatere = newtab
            c.lexstateerrorf = { }
            for key, ef in self.lexstateerrorf.items():
                c.lexstateerrorf[key] = getattr(object,ef.__name__)
            c.lexmodule = object
        return c

    # ------------------------------------------------------------
    # writetab() - Write lexer information to a table file
    # ------------------------------------------------------------
    def writetab(self,tabfile,outputdir=""):
        if isinstance(tabfile,types.ModuleType):
            return
        basetabfilename = tabfile.split(".")[-1]
        filename = os.path.join(outputdir,basetabfilename)+".py"
        tf = open(filename,"w")
        tf.write("# %s.py. This file automatically created by PLY (version %s). Don't edit!\n" % (tabfile,__version__))
        tf.write("_tabversion = %s\n" % repr(__version__))
        tf.write("_lextokens = %s\n" % repr(self.lextokens))
        tf.write("_lexreflags = %s\n" % repr(self.lexreflags))
        tf.write("_lexliterals = %s\n" % repr(self.lexliterals))
        tf.write("_lexstateinfo = %s\n" % repr(self.lexstateinfo))

        tabre = { }
        # Collect all functions in the initial state
        initial = self.lexstatere["INITIAL"]
        initialfuncs = []
        for part in initial:
            for f in part[1]:
                if f and f[0]:
                    initialfuncs.append(f)

        for key, lre in self.lexstatere.items():
            titem = []
            for i in range(len(lre)):
                titem.append((self.lexstateretext[key][i],_funcs_to_names(lre[i][1],self.lexstaterenames[key][i])))
            tabre[key] = titem

        tf.write("_lexstatere = %s\n" % repr(tabre))
        tf.write("_lexstateignore = %s\n" % repr(self.lexstateignore))

        taberr = { }
        for key, ef in self.lexstateerrorf.items():
            if ef:
                taberr[key] = ef.__name__
            else:
                taberr[key] = None
        tf.write("_lexstateerrorf = %s\n" % repr(taberr))
        tf.close()
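
    # For reference, the generated table module is a plain Python file of
    # assignments, roughly like the sketch below (illustrative values only;
    # the actual contents depend on the lexer specification):
    #
    #     _tabversion = '3.3'
    #     _lextokens = {'NUMBER': 1, 'PLUS': 1}
    #     _lexreflags = 0
    #     _lexliterals = ''
    #     _lexstateinfo = {'INITIAL': 'inclusive'}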

    # ------------------------------------------------------------
    # readtab() - Read lexer information from a tab file
    # ------------------------------------------------------------
    def readtab(self,tabfile,fdict):
        if isinstance(tabfile,types.ModuleType):
            lextab = tabfile
        else:
            if sys.version_info[0] < 3:
                exec("import %s as lextab" % tabfile)
            else:
                env = { }
                exec("import %s as lextab" % tabfile, env,env)
                lextab = env['lextab']

        if getattr(lextab,"_tabversion","0.0") != __version__:
            raise ImportError("Inconsistent PLY version")

        self.lextokens = lextab._lextokens
        self.lexreflags = lextab._lexreflags
        self.lexliterals = lextab._lexliterals
        self.lexstateinfo = lextab._lexstateinfo
        self.lexstateignore = lextab._lexstateignore
        self.lexstatere = { }
        self.lexstateretext = { }
        for key,lre in lextab._lexstatere.items():
            titem = []
            txtitem = []
            for i in range(len(lre)):
                titem.append((re.compile(lre[i][0],lextab._lexreflags | re.VERBOSE),_names_to_funcs(lre[i][1],fdict)))
                txtitem.append(lre[i][0])
            self.lexstatere[key] = titem
            self.lexstateretext[key] = txtitem
        self.lexstateerrorf = { }
        for key,ef in lextab._lexstateerrorf.items():
            self.lexstateerrorf[key] = fdict[ef]
        self.begin('INITIAL')

    # ------------------------------------------------------------
    # input() - Push a new string into the lexer
    # ------------------------------------------------------------
    def input(self,s):
        # Pull off the first character to see if s looks like a string
        c = s[:1]
        if not isinstance(c,StringTypes):
            raise ValueError("Expected a string")
        self.lexdata = s
        self.lexpos = 0
        self.lexlen = len(s)

    # ------------------------------------------------------------
    # begin() - Changes the lexing state
    # ------------------------------------------------------------
    def begin(self,state):
        if not state in self.lexstatere:
            raise ValueError("Undefined state")
        self.lexre = self.lexstatere[state]
        self.lexretext = self.lexstateretext[state]
        self.lexignore = self.lexstateignore.get(state,"")
        self.lexerrorf = self.lexstateerrorf.get(state,None)
        self.lexstate = state
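
    # Sketch of how a token rule in a user module might switch states with
    # begin() (hypothetical rule and state names; 'ccode' would have to be
    # declared in the module's states list):
    #
    #     def t_begin_ccode(t):
    #         r'\{'
    #         t.lexer.begin('ccode')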

    # ------------------------------------------------------------
    # push_state() - Changes the lexing state and saves old on stack
    # ------------------------------------------------------------
    def push_state(self,state):
        self.lexstatestack.append(self.lexstate)
        self.begin(state)

    # ------------------------------------------------------------
    # pop_state() - Restores the previous state
    # ------------------------------------------------------------
    def pop_state(self):
        self.begin(self.lexstatestack.pop())

    # ------------------------------------------------------------
    # current_state() - Returns the current lexing state
    # ------------------------------------------------------------
    def current_state(self):
        return self.lexstate

    # ------------------------------------------------------------
    # skip() - Skip ahead n characters
    # ------------------------------------------------------------
    def skip(self,n):
        self.lexpos += n

    # ------------------------------------------------------------
    # token() - Return the next token from the Lexer
    #
    # Note: This function has been carefully implemented to be as fast
    # as possible.  Don't make changes unless you really know what
    # you are doing
    # ------------------------------------------------------------
    def token(self):
        # Make local copies of frequently referenced attributes
        lexpos = self.lexpos
        lexlen = self.lexlen
        lexignore = self.lexignore
        lexdata = self.lexdata

        while lexpos < lexlen:
            # Short-circuit for whitespace, tabs, and other ignored characters
            if lexdata[lexpos] in lexignore:
                lexpos += 1
                continue

            # Look for a regular expression match
            for lexre,lexindexfunc in self.lexre:
                m = lexre.match(lexdata,lexpos)
                if not m: continue

                # Create a token for return
                tok = LexToken()
                tok.value = m.group()
                tok.lineno = self.lineno
                tok.lexpos = lexpos

                i = m.lastindex
                func,tok.type = lexindexfunc[i]

                if not func:
                    # If no token type was set, it's an ignored token
                    if tok.type:
                        self.lexpos = m.end()
                        return tok
                    else:
                        lexpos = m.end()
                        break

                lexpos = m.end()

                # If token is processed by a function, call it

                tok.lexer = self      # Set additional attributes useful in token rules
                self.lexmatch = m
                self.lexpos = lexpos

                newtok = func(tok)

                # Every function must return a token.  If it returns nothing,
                # the token is discarded and we move on to the next one.
                if not newtok:
                    lexpos = self.lexpos        # This is here in case user has updated lexpos.
                    lexignore = self.lexignore  # This is here in case there was a state change
                    break

                # Verify type of the token.  If not in the token map, raise an error
                if not self.lexoptimize:
                    if not newtok.type in self.lextokens:
                        raise LexError("%s:%d: Rule '%s' returned an unknown token type '%s'" % (
                            func_code(func).co_filename, func_code(func).co_firstlineno,
                            func.__name__, newtok.type),lexdata[lexpos:])

                return newtok
            else:
                # No match, see if in literals
                if lexdata[lexpos] in self.lexliterals:
                    tok = LexToken()
                    tok.value = lexdata[lexpos]
                    tok.lineno = self.lineno
                    tok.type = tok.value
                    tok.lexpos = lexpos
                    self.lexpos = lexpos + 1
                    return tok

                # No match. Call t_error() if defined.
                if self.lexerrorf:
                    tok = LexToken()
                    tok.value = self.lexdata[lexpos:]
                    tok.lineno = self.lineno
                    tok.type = "error"
                    tok.lexer = self
                    tok.lexpos = lexpos
                    self.lexpos = lexpos
                    newtok = self.lexerrorf(tok)
                    if lexpos == self.lexpos:
                        # Error method didn't change text position at all. This is an error.
                        raise LexError("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:])
                    lexpos = self.lexpos
                    if not newtok: continue
                    return newtok

                self.lexpos = lexpos
                raise LexError("Illegal character '%s' at index %d" % (lexdata[lexpos],lexpos), lexdata[lexpos:])

        self.lexpos = lexpos + 1
        if self.lexdata is None:
            raise RuntimeError("No input string given with input()")
        return None

    # Iterator interface
    def __iter__(self):
        return self

    def next(self):
        t = self.token()
        if t is None:
            raise StopIteration
        return t

    __next__ = next
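
# Sketch of a typical t_error() rule in a user lexer module (hypothetical):
# it reports the offending character and uses skip() to advance past it,
# which satisfies the position-change check in token() above.
#
#     def t_error(t):
#         print("Illegal character %r" % t.value[0])
#         t.lexer.skip(1)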

# -----------------------------------------------------------------------------
# === Lex Builder ===
#
# The functions and classes below are used to collect lexing information
# and build a Lexer object from it.
# -----------------------------------------------------------------------------

# -----------------------------------------------------------------------------
# get_caller_module_dict()
#
# This function returns a dictionary containing all of the symbols defined within
# a caller further down the call stack.  This is used to get the environment
# associated with the yacc() call if none was provided.
# -----------------------------------------------------------------------------

def get_caller_module_dict(levels):
    try:
        raise RuntimeError
    except RuntimeError:
        e,b,t = sys.exc_info()
        f = t.tb_frame
        while levels > 0:
            f = f.f_back
            levels -= 1
        ldict = f.f_globals.copy()
        if f.f_globals != f.f_locals:
            ldict.update(f.f_locals)

        return ldict

# -----------------------------------------------------------------------------
# _funcs_to_names()
#
# Given a list of regular expression functions, this converts it to a list
# suitable for output to a table file
# -----------------------------------------------------------------------------

def _funcs_to_names(funclist,namelist):
    result = []
    for f,name in zip(funclist,namelist):
        if f and f[0]:
            result.append((name, f[1]))
        else:
            result.append(f)
    return result

# -----------------------------------------------------------------------------
# _names_to_funcs()
#
# Given a list of regular expression function names, this converts it back to
# functions.
# -----------------------------------------------------------------------------

def _names_to_funcs(namelist,fdict):
    result = []
    for n in namelist:
        if n and n[0]:
            result.append((fdict[n[0]],n[1]))
        else:
            result.append(n)
    return result
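
# The two helpers above are inverses of one another.  A sketch with
# hypothetical data, where t_NUMBER is some rule function:
#
#     _funcs_to_names([(t_NUMBER, 'NUMBER')], ['t_NUMBER'])
#         # -> [('t_NUMBER', 'NUMBER')]
#     _names_to_funcs([('t_NUMBER', 'NUMBER')], {'t_NUMBER': t_NUMBER})
#         # -> [(t_NUMBER, 'NUMBER')]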

# -----------------------------------------------------------------------------
# _form_master_re()
#
# This function takes a list of all of the regex components and attempts to
# form the master regular expression.  Given limitations in the Python re
# module, it may be necessary to break the master regex into separate expressions.
# -----------------------------------------------------------------------------

def _form_master_re(relist,reflags,ldict,toknames):
    if not relist: return []
    regex = "|".join(relist)
    try:
        lexre = re.compile(regex,re.VERBOSE | reflags)

        # Build the index to function map for the matching engine
        lexindexfunc = [ None ] * (max(lexre.groupindex.values())+1)
        lexindexnames = lexindexfunc[:]

        for f,i in lexre.groupindex.items():
            handle = ldict.get(f,None)
            if type(handle) in (types.FunctionType, types.MethodType):
                lexindexfunc[i] = (handle,toknames[f])
                lexindexnames[i] = f
            elif handle is not None:
                lexindexnames[i] = f
                if f.find("ignore_") > 0:
                    lexindexfunc[i] = (None,None)
                else:
                    lexindexfunc[i] = (None, toknames[f])

        return [(lexre,lexindexfunc)],[regex],[lexindexnames]
    except Exception:
        m = int(len(relist)/2)
        if m == 0: m = 1
        llist, lre, lnames = _form_master_re(relist[:m],reflags,ldict,toknames)
        rlist, rre, rnames = _form_master_re(relist[m:],reflags,ldict,toknames)
        return llist+rlist, lre+rre, lnames+rnames
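
# For illustration, two simple components combine into one named-group
# pattern (a sketch; the real components are built in lex() below):
#
#     relist = [r'(?P<NUMBER>\d+)', r'(?P<PLUS>\+)']
#     "|".join(relist)     # -> '(?P<NUMBER>\d+)|(?P<PLUS>\+)'
#
# m.lastindex on a match against the combined pattern then identifies which
# group, and hence which rule, matched.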

# -----------------------------------------------------------------------------
# def _statetoken(s,names)
#
# Given a declaration name s of the form "t_" and a dictionary whose keys are
# state names, this function returns a tuple (states,tokenname) where states
# is a tuple of state names and tokenname is the name of the token.  For example,
# calling this with s = "t_foo_bar_SPAM" might return (('foo','bar'),'SPAM')
# -----------------------------------------------------------------------------

def _statetoken(s,names):
    nonstate = 1
    parts = s.split("_")
    for i in range(1,len(parts)):
        if not parts[i] in names and parts[i] != 'ANY': break
    if i > 1:
        states = tuple(parts[1:i])
    else:
        states = ('INITIAL',)

    if 'ANY' in states:
        states = tuple(names)

    tokenname = "_".join(parts[i:])
    return (states,tokenname)


# -----------------------------------------------------------------------------
# LexerReflect()
#
# This class represents information needed to build a lexer as extracted from a
# user's input file.
# -----------------------------------------------------------------------------
class LexerReflect(object):
    def __init__(self,ldict,log=None,reflags=0):
        self.ldict = ldict
        self.error_func = None
        self.tokens = []
        self.reflags = reflags
        self.stateinfo = { 'INITIAL' : 'inclusive'}
        self.files = {}
        self.error = 0

        if log is None:
            self.log = PlyLogger(sys.stderr)
        else:
            self.log = log

    # Get all of the basic information
    def get_all(self):
        self.get_tokens()
        self.get_literals()
        self.get_states()
        self.get_rules()

    # Validate all of the information
    def validate_all(self):
        self.validate_tokens()
        self.validate_literals()
        self.validate_rules()
        return self.error

    # Get the tokens map
    def get_tokens(self):
        tokens = self.ldict.get("tokens",None)
        if not tokens:
            self.log.error("No token list is defined")
            self.error = 1
            return

        if not isinstance(tokens,(list, tuple)):
            self.log.error("tokens must be a list or tuple")
            self.error = 1
            return

        if not tokens:
            self.log.error("tokens is empty")
            self.error = 1
            return

        self.tokens = tokens

    # Validate the tokens
    def validate_tokens(self):
        terminals = {}
        for n in self.tokens:
            if not _is_identifier.match(n):
                self.log.error("Bad token name '%s'",n)
                self.error = 1
            if n in terminals:
                self.log.warning("Token '%s' multiply defined", n)
            terminals[n] = 1

    # Get the literals specifier
    def get_literals(self):
        self.literals = self.ldict.get("literals","")

    # Validate literals
    def validate_literals(self):
        try:
            for c in self.literals:
                if not isinstance(c,StringTypes) or len(c) > 1:
                    self.log.error("Invalid literal %s. Must be a single character", repr(c))
                    self.error = 1
                    continue

        except TypeError:
            self.log.error("Invalid literals specification. literals must be a sequence of characters")
            self.error = 1
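
    # A minimal user module understood by this class looks like (a sketch):
    #
    #     tokens = ('NUMBER', 'PLUS')
    #     literals = '()'
    #
    #     t_PLUS = r'\+'
    #
    #     def t_NUMBER(t):
    #         r'\d+'
    #         t.value = int(t.value)
    #         return t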

    def get_states(self):
        self.states = self.ldict.get("states",None)
        # Build statemap
        if self.states:
            if not isinstance(self.states,(tuple,list)):
                self.log.error("states must be defined as a tuple or list")
                self.error = 1
            else:
                for s in self.states:
                    if not isinstance(s,tuple) or len(s) != 2:
                        self.log.error("Invalid state specifier %s. Must be a tuple (statename,'exclusive|inclusive')",repr(s))
                        self.error = 1
                        continue
                    name, statetype = s
                    if not isinstance(name,StringTypes):
                        self.log.error("State name %s must be a string", repr(name))
                        self.error = 1
                        continue
                    if not (statetype == 'inclusive' or statetype == 'exclusive'):
                        self.log.error("State type for state %s must be 'inclusive' or 'exclusive'",name)
                        self.error = 1
                        continue
                    if name in self.stateinfo:
                        self.log.error("State '%s' already defined",name)
                        self.error = 1
                        continue
                    self.stateinfo[name] = statetype

    # Get all of the symbols with a t_ prefix and sort them into various
    # categories (functions, strings, error functions, and ignore characters)

    def get_rules(self):
        tsymbols = [f for f in self.ldict if f[:2] == 't_' ]

        # Now build up a list of functions and a list of strings
        self.toknames = { }        # Mapping of symbols to token names
        self.funcsym = { }         # Symbols defined as functions
        self.strsym = { }          # Symbols defined as strings
        self.ignore = { }          # Ignore strings by state
        self.errorf = { }          # Error functions by state

        for s in self.stateinfo:
            self.funcsym[s] = []
            self.strsym[s] = []

        if len(tsymbols) == 0:
            self.log.error("No rules of the form t_rulename are defined")
            self.error = 1
            return

        for f in tsymbols:
            t = self.ldict[f]
            states, tokname = _statetoken(f,self.stateinfo)
            self.toknames[f] = tokname

            if hasattr(t,"__call__"):
                if tokname == 'error':
                    for s in states:
                        self.errorf[s] = t
                elif tokname == 'ignore':
                    line = func_code(t).co_firstlineno
                    file = func_code(t).co_filename
                    self.log.error("%s:%d: Rule '%s' must be defined as a string",file,line,t.__name__)
                    self.error = 1
                else:
                    for s in states:
                        self.funcsym[s].append((f,t))
            elif isinstance(t, StringTypes):
                if tokname == 'ignore':
                    for s in states:
                        self.ignore[s] = t
                    if "\\" in t:
                        self.log.warning("%s contains a literal backslash '\\'",f)

                elif tokname == 'error':
                    self.log.error("Rule '%s' must be defined as a function", f)
                    self.error = 1
                else:
                    for s in states:
                        self.strsym[s].append((f,t))
            else:
                self.log.error("%s not defined as a function or string", f)
                self.error = 1

        # Sort the functions by line number
        for f in self.funcsym.values():
            f.sort(key=lambda x: func_code(x[1]).co_firstlineno)

        # Sort the strings by regular expression length
        for s in self.strsym.values():
            if sys.version_info[0] < 3:
                s.sort(lambda x,y: (len(x[1]) < len(y[1])) - (len(x[1]) > len(y[1])))
            else:
                # Python 3.0
                s.sort(key=lambda x: len(x[1]),reverse=True)

    # Validate all of the t_rules collected
    def validate_rules(self):
        for state in self.stateinfo:
            # Validate all rules defined by functions

            for fname, f in self.funcsym[state]:
                line = func_code(f).co_firstlineno
                file = func_code(f).co_filename
                self.files[file] = 1

                tokname = self.toknames[fname]
                if isinstance(f, types.MethodType):
                    reqargs = 2
                else:
                    reqargs = 1
                nargs = func_code(f).co_argcount
                if nargs > reqargs:
                    self.log.error("%s:%d: Rule '%s' has too many arguments",file,line,f.__name__)
                    self.error = 1
                    continue

                if nargs < reqargs:
                    self.log.error("%s:%d: Rule '%s' requires an argument", file,line,f.__name__)
                    self.error = 1
                    continue

                if not f.__doc__:
                    self.log.error("%s:%d: No regular expression defined for rule '%s'",file,line,f.__name__)
                    self.error = 1
                    continue

                try:
                    c = re.compile("(?P<%s>%s)" % (fname,f.__doc__), re.VERBOSE | self.reflags)
                    if c.match(""):
                        self.log.error("%s:%d: Regular expression for rule '%s' matches empty string", file,line,f.__name__)
                        self.error = 1
                except re.error:
                    _etype, e, _etrace = sys.exc_info()
                    self.log.error("%s:%d: Invalid regular expression for rule '%s'. %s", file,line,f.__name__,e)
%s", file,line,f.__name__,e) 759*4882a593Smuzhiyun if '#' in f.__doc__: 760*4882a593Smuzhiyun self.log.error("%s:%d. Make sure '#' in rule '%s' is escaped with '\\#'",file,line, f.__name__) 761*4882a593Smuzhiyun self.error = 1 762*4882a593Smuzhiyun 763*4882a593Smuzhiyun # Validate all rules defined by strings 764*4882a593Smuzhiyun for name,r in self.strsym[state]: 765*4882a593Smuzhiyun tokname = self.toknames[name] 766*4882a593Smuzhiyun if tokname == 'error': 767*4882a593Smuzhiyun self.log.error("Rule '%s' must be defined as a function", name) 768*4882a593Smuzhiyun self.error = 1 769*4882a593Smuzhiyun continue 770*4882a593Smuzhiyun 771*4882a593Smuzhiyun if not tokname in self.tokens and tokname.find("ignore_") < 0: 772*4882a593Smuzhiyun self.log.error("Rule '%s' defined for an unspecified token %s",name,tokname) 773*4882a593Smuzhiyun self.error = 1 774*4882a593Smuzhiyun continue 775*4882a593Smuzhiyun 776*4882a593Smuzhiyun try: 777*4882a593Smuzhiyun c = re.compile("(?P<%s>%s)" % (name,r),re.VERBOSE | self.reflags) 778*4882a593Smuzhiyun if (c.match("")): 779*4882a593Smuzhiyun self.log.error("Regular expression for rule '%s' matches empty string",name) 780*4882a593Smuzhiyun self.error = 1 781*4882a593Smuzhiyun except re.error: 782*4882a593Smuzhiyun _etype, e, _etrace = sys.exc_info() 783*4882a593Smuzhiyun self.log.error("Invalid regular expression for rule '%s'. %s",name,e) 784*4882a593Smuzhiyun if '#' in r: 785*4882a593Smuzhiyun self.log.error("Make sure '#' in rule '%s' is escaped with '\\#'",name) 786*4882a593Smuzhiyun self.error = 1 787*4882a593Smuzhiyun 788*4882a593Smuzhiyun if not self.funcsym[state] and not self.strsym[state]: 789*4882a593Smuzhiyun self.log.error("No rules defined for state '%s'",state) 790*4882a593Smuzhiyun self.error = 1 791*4882a593Smuzhiyun 792*4882a593Smuzhiyun # Validate the error function 793*4882a593Smuzhiyun efunc = self.errorf.get(state,None) 794*4882a593Smuzhiyun if efunc: 795*4882a593Smuzhiyun f = efunc 796*4882a593Smuzhiyun line = func_code(f).co_firstlineno 797*4882a593Smuzhiyun file = func_code(f).co_filename 798*4882a593Smuzhiyun self.files[file] = 1 799*4882a593Smuzhiyun 800*4882a593Smuzhiyun if isinstance(f, types.MethodType): 801*4882a593Smuzhiyun reqargs = 2 802*4882a593Smuzhiyun else: 803*4882a593Smuzhiyun reqargs = 1 804*4882a593Smuzhiyun nargs = func_code(f).co_argcount 805*4882a593Smuzhiyun if nargs > reqargs: 806*4882a593Smuzhiyun self.log.error("%s:%d: Rule '%s' has too many arguments",file,line,f.__name__) 807*4882a593Smuzhiyun self.error = 1 808*4882a593Smuzhiyun 809*4882a593Smuzhiyun if nargs < reqargs: 810*4882a593Smuzhiyun self.log.error("%s:%d: Rule '%s' requires an argument", file,line,f.__name__) 811*4882a593Smuzhiyun self.error = 1 812*4882a593Smuzhiyun 813*4882a593Smuzhiyun for f in self.files: 814*4882a593Smuzhiyun self.validate_file(f) 815*4882a593Smuzhiyun 816*4882a593Smuzhiyun 817*4882a593Smuzhiyun # ----------------------------------------------------------------------------- 818*4882a593Smuzhiyun # validate_file() 819*4882a593Smuzhiyun # 820*4882a593Smuzhiyun # This checks to see if there are duplicated t_rulename() functions or strings 821*4882a593Smuzhiyun # in the parser input file. This is done using a simple regular expression 822*4882a593Smuzhiyun # match on each line in the given file. 

    # -----------------------------------------------------------------------------
    # validate_file()
    #
    # This checks to see if there are duplicated t_rulename() functions or strings
    # in the parser input file.  This is done using a simple regular expression
    # match on each line in the given file.
    # -----------------------------------------------------------------------------

    def validate_file(self,filename):
        import os.path
        base,ext = os.path.splitext(filename)
        if ext != '.py': return         # No idea what the file is. Return OK

        try:
            f = open(filename)
            lines = f.readlines()
            f.close()
        except IOError:
            return                      # Couldn't find the file.  Don't worry about it

        fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(')
        sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=')

        counthash = { }
        linen = 1
        for l in lines:
            m = fre.match(l)
            if not m:
                m = sre.match(l)
            if m:
                name = m.group(1)
                prev = counthash.get(name)
                if not prev:
                    counthash[name] = linen
                else:
                    self.log.error("%s:%d: Rule %s redefined. Previously defined on line %d",filename,linen,name,prev)
                    self.error = 1
            linen += 1

# -----------------------------------------------------------------------------
# lex(module)
#
# Build all of the regular expression rules from definitions in the supplied module
# -----------------------------------------------------------------------------
def lex(module=None,object=None,debug=0,optimize=0,lextab="lextab",reflags=0,nowarn=0,outputdir="", debuglog=None, errorlog=None):
    global lexer
    ldict = None
    stateinfo = { 'INITIAL' : 'inclusive'}
    lexobj = Lexer()
    lexobj.lexoptimize = optimize
    global token,input

    if errorlog is None:
        errorlog = PlyLogger(sys.stderr)

    if debug:
        if debuglog is None:
            debuglog = PlyLogger(sys.stderr)

    # Get the module dictionary used for the lexer
    if object: module = object

    if module:
        _items = [(k,getattr(module,k)) for k in dir(module)]
        ldict = dict(_items)
    else:
        ldict = get_caller_module_dict(2)

    # Collect parser information from the dictionary
    linfo = LexerReflect(ldict,log=errorlog,reflags=reflags)
    linfo.get_all()
    if not optimize:
        if linfo.validate_all():
            raise SyntaxError("Can't build lexer")

    if optimize and lextab:
        try:
            lexobj.readtab(lextab,ldict)
            token = lexobj.token
            input = lexobj.input
            lexer = lexobj
            return lexobj

        except ImportError:
            pass

    # Dump some basic debugging information
    if debug:
        debuglog.info("lex: tokens = %r", linfo.tokens)
        debuglog.info("lex: literals = %r", linfo.literals)
        debuglog.info("lex: states = %r", linfo.stateinfo)

    # Build a dictionary of valid token names
    lexobj.lextokens = { }
    for n in linfo.tokens:
        lexobj.lextokens[n] = 1

    # Get literals specification
    if isinstance(linfo.literals,(list,tuple)):
        lexobj.lexliterals = type(linfo.literals[0])().join(linfo.literals)
    else:
        lexobj.lexliterals = linfo.literals

    # Get the stateinfo dictionary
    stateinfo = linfo.stateinfo

    regexs = { }
    # Build the master regular expressions
    for state in stateinfo:
        regex_list = []

        # Add rules defined by functions first
        for fname, f in linfo.funcsym[state]:
            line = func_code(f).co_firstlineno
            file = func_code(f).co_filename
            regex_list.append("(?P<%s>%s)" % (fname,f.__doc__))
            if debug:
                debuglog.info("lex: Adding rule %s -> '%s' (state '%s')",fname,f.__doc__, state)

        # Now add all of the simple rules
        for name,r in linfo.strsym[state]:
            regex_list.append("(?P<%s>%s)" % (name,r))
            if debug:
                debuglog.info("lex: Adding rule %s -> '%s' (state '%s')",name,r, state)

        regexs[state] = regex_list

    # Build the master regular expressions

    if debug:
        debuglog.info("lex: ==== MASTER REGEXS FOLLOW ====")

    for state in regexs:
        lexre, re_text, re_names = _form_master_re(regexs[state],reflags,ldict,linfo.toknames)
        lexobj.lexstatere[state] = lexre
        lexobj.lexstateretext[state] = re_text
        lexobj.lexstaterenames[state] = re_names
        if debug:
            for i in range(len(re_text)):
                debuglog.info("lex: state '%s' : regex[%d] = '%s'",state, i, re_text[i])

    # For inclusive states, we need to add the regular expressions from the INITIAL state
    for state,stype in stateinfo.items():
        if state != "INITIAL" and stype == 'inclusive':
            lexobj.lexstatere[state].extend(lexobj.lexstatere['INITIAL'])
            lexobj.lexstateretext[state].extend(lexobj.lexstateretext['INITIAL'])
            lexobj.lexstaterenames[state].extend(lexobj.lexstaterenames['INITIAL'])

    lexobj.lexstateinfo = stateinfo
    lexobj.lexre = lexobj.lexstatere["INITIAL"]
    lexobj.lexretext = lexobj.lexstateretext["INITIAL"]
    lexobj.lexreflags = reflags

    # Set up ignore variables
    lexobj.lexstateignore = linfo.ignore
    lexobj.lexignore = lexobj.lexstateignore.get("INITIAL","")

    # Set up error functions
    lexobj.lexstateerrorf = linfo.errorf
    lexobj.lexerrorf = linfo.errorf.get("INITIAL",None)
    if not lexobj.lexerrorf:
        errorlog.warning("No t_error rule is defined")

    # Check state information for ignore and error rules
    for s,stype in stateinfo.items():
        if stype == 'exclusive':
            if not s in linfo.errorf:
                errorlog.warning("No error rule is defined for exclusive state '%s'", s)
            if not s in linfo.ignore and lexobj.lexignore:
                errorlog.warning("No ignore rule is defined for exclusive state '%s'", s)
        elif stype == 'inclusive':
            if not s in linfo.errorf:
                linfo.errorf[s] = linfo.errorf.get("INITIAL",None)
            if not s in linfo.ignore:
                linfo.ignore[s] = linfo.ignore.get("INITIAL","")

    # Create global versions of the token() and input() functions
    token = lexobj.token
    input = lexobj.input
    lexer = lexobj

    # If in optimize mode, we write the lextab
    if lextab and optimize:
        lexobj.writetab(lextab,outputdir)

    return lexobj

# -----------------------------------------------------------------------------
# runmain()
#
# This runs the lexer as a main program
# -----------------------------------------------------------------------------

def runmain(lexer=None,data=None):
    if not data:
        try:
            filename = sys.argv[1]
            f = open(filename)
            data = f.read()
            f.close()
        except IndexError:
            sys.stdout.write("Reading from standard input (type EOF to end):\n")
            data = sys.stdin.read()

    if lexer:
        _input = lexer.input
    else:
        _input = input
    _input(data)
    if lexer:
        _token = lexer.token
    else:
        _token = token

    while 1:
        tok = _token()
        if not tok: break
        sys.stdout.write("(%s,%r,%d,%d)\n" % (tok.type, tok.value, tok.lineno,tok.lexpos))
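
# Example invocation (a sketch): building in optimized mode reads a previously
# written table module if its version matches, and writes one otherwise.
# "mylextab" is a hypothetical module name:
#
#     lexer = lex(optimize=1, lextab="mylextab")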

# -----------------------------------------------------------------------------
# @TOKEN(regex)
#
# This decorator function can be used to set the regular expression on a
# function when its docstring needs to be set in an alternative way
# -----------------------------------------------------------------------------

def TOKEN(r):
    def set_doc(f):
        if hasattr(r,"__call__"):
            f.__doc__ = r.__doc__
        else:
            f.__doc__ = r
        return f
    return set_doc

# Alternative spelling of the TOKEN decorator
Token = TOKEN
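
# Usage sketch: build the rule's regular expression programmatically instead
# of writing it as a docstring literal (hypothetical rule):
#
#     digit = r'[0-9]'
#
#     @TOKEN(digit + r'+')
#     def t_NUMBER(t):
#         t.value = int(t.value)
#         return t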