How to check if a string is a valid python identifier? including keyword check? How to check if a string is a valid python identifier? including keyword check? python python

How to check if a string is a valid python identifier? including keyword check?


Python 3

Python 3 now has 'foo'.isidentifier(), which seems to be the best solution for recent Python versions (thanks to fellow runciter@freenode for the suggestion). However, somewhat counter-intuitively, it does not check against the list of keywords, so a combination of both must be used:

import keyword


def isidentifier(ident: str) -> bool:
    """Return True if *ident* can be used as a Python 3 identifier.

    ``str.isidentifier()`` only validates the lexical shape of the string and
    therefore also accepts reserved words such as ``"pass"``; the keyword
    list is consulted as a second step to reject those.

    Args:
        ident: The candidate identifier.

    Returns:
        True when *ident* is a lexically valid identifier and not a keyword.

    Raises:
        TypeError: If *ident* is not a ``str``. Failing fast here is
            deliberate: a non-string argument is a caller bug, not merely
            an "invalid identifier".
    """
    if not isinstance(ident, str):
        raise TypeError("expected str, but got {!r}".format(type(ident)))
    return ident.isidentifier() and not keyword.iskeyword(ident)

Python 2

For Python 2, the easiest way to check whether a given string is a valid Python identifier is to let Python parse it itself.

There are two possible approaches. The fastest is to use ast and check whether the AST of the single expression has the desired shape:

import ast


def isidentifier(ident):
    """Determine whether *ident* is a valid Python identifier using ``ast``.

    The string is parsed as a module; it is accepted only when the resulting
    tree is exactly ``Module([Expr(Name(id=ident))])``, i.e. one expression
    statement consisting of a single bare name.

    Args:
        ident: The candidate identifier.

    Returns:
        True if *ident* parses to a lone name whose spelling equals *ident*,
        False otherwise (including when the parser rejects the input).

    Raises:
        TypeError: If *ident* is not a ``str``. It is better to fail fast
            than to silently report "not an identifier" for a caller bug.
    """
    if not isinstance(ident, str):
        raise TypeError("expected str, but got {!r}".format(type(ident)))

    try:
        root = ast.parse(ident)
    except (SyntaxError, ValueError, TypeError):
        # SyntaxError covers ordinary parse failures. Depending on the
        # interpreter version, input containing a NUL byte is reported as
        # ValueError or TypeError instead; such input is simply "not an
        # identifier", not a reason to crash.
        return False

    if not isinstance(root, ast.Module) or len(root.body) != 1:
        return False

    statement = root.body[0]
    if not isinstance(statement, ast.Expr):
        return False

    name = statement.value
    if not isinstance(name, ast.Name):
        return False

    # The parser ignores trailing whitespace and comments, so "foo " still
    # yields Name("foo"); comparing the spelling rejects such input.
    return name.id == ident

Another is to let the tokenize module split the identifier into a stream of tokens and check that it contains only our name:

import keyword
import tokenize


def isidentifier(ident):
    """Determine whether *ident* is a valid Python identifier using ``tokenize``.

    The string is tokenized as a one-line program and accepted only when the
    sole meaningful token is a NAME whose text equals the whole input.

    Args:
        ident: The candidate identifier.

    Returns:
        True if *ident* tokenizes to exactly one NAME spanning the entire
        string and is not a keyword; False otherwise.

    Raises:
        TypeError: If *ident* is not a ``str``. It is better to fail fast
            than to silently report "not an identifier" for a caller bug.
    """
    if not isinstance(ident, str):
        raise TypeError("expected str, but got {!r}".format(type(ident)))

    # Quick test: a keyword tokenizes as NAME too, so rule it out up front.
    if keyword.iskeyword(ident):
        return False

    # Hand the tokenizer exactly one line, then '' to signal end of input.
    pending = iter([ident])

    def readline():
        return next(pending, '')

    try:
        tokens = list(tokenize.generate_tokens(readline))
    except (tokenize.TokenError, SyntaxError):
        # Raised for input the tokenizer cannot finish, e.g. an unclosed
        # bracket such as "(".
        return False

    # ENDMARKER always terminates the stream, and some tokenizer versions
    # additionally emit an implicit NEWLINE before it; neither belongs to
    # the candidate itself.
    framing = (tokenize.NEWLINE, tokenize.ENDMARKER)
    significant = [tok for tok in tokens if tok[0] not in framing]
    if len(significant) != 1:
        return False

    tok_type, tok_text = significant[0][:2]
    # The NAME must span the whole string, so surrounding whitespace or a
    # trailing newline in *ident* makes the comparison fail.
    return tok_type == tokenize.NAME and tok_text == ident

The same function, but compatible with Python 3, looks like this:

import io
import keyword
import tokenize


def isidentifier_py3(ident):
    """Determine whether *ident* is a valid identifier using Python 3 ``tokenize``.

    The string is encoded to UTF-8 (with BOM), tokenized as a one-line
    program, and accepted only when the sole meaningful token is a NAME whose
    text equals the whole input.

    Args:
        ident: The candidate identifier.

    Returns:
        True if *ident* tokenizes to exactly one NAME spanning the entire
        string and is not a keyword; False otherwise.

    Raises:
        TypeError: If *ident* is not a ``str``. It is better to fail fast
            than to silently report "not an identifier" for a caller bug.
    """
    if not isinstance(ident, str):
        raise TypeError("expected str, but got {!r}".format(type(ident)))

    # Quick test: a keyword tokenizes as NAME too, so rule it out up front.
    if keyword.iskeyword(ident):
        return False

    try:
        # The explicit BOM pins the detected source encoding to UTF-8.
        source = ident.encode('utf-8-sig')
    except UnicodeEncodeError:
        # Lone surrogates cannot be encoded and cannot be identifiers.
        return False

    try:
        tokens = list(tokenize.tokenize(io.BytesIO(source).readline))
    except (tokenize.TokenError, SyntaxError):
        # TokenError: input the tokenizer cannot finish (e.g. "(").
        # SyntaxError: e.g. a coding cookie that contradicts the BOM.
        return False

    # ENCODING opens the stream and ENDMARKER closes it; some tokenizer
    # versions also emit an implicit NEWLINE before ENDMARKER. None of these
    # belong to the candidate itself, so the token count is not fixed.
    framing = (tokenize.ENCODING, tokenize.NEWLINE, tokenize.ENDMARKER)
    significant = [tok for tok in tokens if tok.type not in framing]
    if len(significant) != 1:
        return False

    only = significant[0]
    # The NAME must span the whole string, so surrounding whitespace or a
    # trailing newline in *ident* makes the comparison fail.
    return only.type == tokenize.NAME and only.string == ident

However, be aware of bugs in the Python 3 tokenize implementation that cause it to reject some completely valid identifiers, such as ℘᧚ and 贈ᩭ. ast works fine, though. Generally, I'd advise against using the tokenize-based implementation for actual checks.

Also, some may consider heavy machinery like AST parser to be a tad overkill. This simple implementation is self-contained and guaranteed to work on any Python 2:

import keyword
import string

# ASCII-only character classes. ``string.ascii_letters`` is used instead of
# ``string.lowercase``/``string.uppercase`` because the latter are
# locale-dependent on Python 2 and do not exist on Python 3.
_IDENT_START = frozenset('_' + string.ascii_letters)
_IDENT_CONTINUE = _IDENT_START | frozenset(string.digits)


def isidentifier(ident):
    """Determine whether *ident* is a valid ASCII Python identifier.

    Self-contained check: the first character must be an ASCII letter or
    underscore, every following character an ASCII letter, digit or
    underscore, and the string must not be a keyword. Non-ASCII identifiers
    are rejected by design.

    Args:
        ident: The candidate identifier.

    Returns:
        True if *ident* matches the rules above, False otherwise
        (including for the empty string).

    Raises:
        TypeError: If *ident* is not a ``str``.
    """
    if not isinstance(ident, str):
        raise TypeError("expected str, but got {!r}".format(type(ident)))
    if not ident or keyword.iskeyword(ident):
        return False
    if ident[0] not in _IDENT_START:
        return False
    return all(ch in _IDENT_CONTINUE for ch in ident[1:])

Here are a few tests to check that these all work:

import sys

# Smoke tests for whichever ``isidentifier`` implementation is in scope.
assert isidentifier('foo')
assert isidentifier('foo1_23')
assert not isidentifier('pass')     # syntactically correct keyword
assert not isidentifier('foo ')     # trailing whitespace
assert not isidentifier(' foo')     # leading whitespace
assert not isidentifier('1234')     # number
assert not isidentifier('1234abc')  # number and letters
assert not isidentifier('👻')       # Unicode not from allowed range
assert not isidentifier('')         # empty string
assert not isidentifier('   ')      # whitespace only
assert not isidentifier('foo bar')  # several tokens
assert not isidentifier('no-dashed-names-for-you')  # no such thing in Python

# Unicode identifiers are only allowed in Python 3, so only check there:
if sys.version_info[0] >= 3:
    assert isidentifier('℘᧚')  # Unicode $Other_ID_Start and $Other_ID_Continue

Performance

All measurements were conducted on my machine (MBPr Mid 2014) on the same randomly generated test set of 1 500 000 elements: 1 000 000 valid and 500 000 invalid. YMMV.

== Python 3:
method | calls/sec | faster
---------------------------
token  |    48 286 |  1.00x
ast    |   175 530 |  3.64x
native | 1 924 680 | 39.86x

== Python 2:
method | calls/sec | faster
---------------------------
token  |    83 994 |  1.00x
ast    |   208 206 |  2.48x
simple | 1 066 461 | 12.70x


The keyword module contains the list of all reserved keywords:

>>> import keyword
>>> keyword.iskeyword("in")
True
>>> keyword.kwlist
['and', 'as', 'assert', 'break', 'class', 'continue', 'def', 'del', 'elif', 'else', 'except', 'exec', 'finally', 'for', 'from', 'global', 'if', 'import', 'in', 'is', 'lambda', 'not', 'or', 'pass', 'print', 'raise', 'return', 'try', 'while', 'with', 'yield']

Note that this list will be different depending on what major version of Python you are using, as the list of keywords changes (especially between Python 2 and Python 3).

If you also want all builtin names, use __builtins__

>>> dir(__builtins__)['ArithmeticError', 'AssertionError', 'AttributeError', 'BaseException', 'BlockingIOError', 'BrokenPipeError', 'BufferError', 'BytesWarning', 'ChildProcessError', 'ConnectionAbortedError', 'ConnectionError', 'ConnectionRefusedError', 'ConnectionResetError', 'DeprecationWarning', 'EOFError', 'Ellipsis', 'EnvironmentError', 'Exception', 'False', 'FileExistsError', 'FileNotFoundError', 'FloatingPointError', 'FutureWarning', 'GeneratorExit', 'IOError', 'ImportError', 'ImportWarning', 'IndentationError', 'IndexError', 'InterruptedError', 'IsADirectoryError', 'KeyError', 'KeyboardInterrupt', 'LookupError', 'MemoryError', 'NameError', 'None', 'NotADirectoryError', 'NotImplemented', 'NotImplementedError', 'OSError', 'OverflowError', 'PendingDeprecationWarning', 'PermissionError', 'ProcessLookupError', 'ReferenceError', 'ResourceWarning', 'RuntimeError', 'RuntimeWarning', 'StopIteration', 'SyntaxError', 'SyntaxWarning', 'SystemError', 'SystemExit', 'TabError', 'TimeoutError', 'True', 'TypeError', 'UnboundLocalError', 'UnicodeDecodeError', 'UnicodeEncodeError', 'UnicodeError', 'UnicodeTranslateError', 'UnicodeWarning', 'UserWarning', 'ValueError', 'Warning', 'ZeroDivisionError', '_', '__build_class__', '__debug__', '__doc__', '__import__', '__name__', '__package__', 'abs', 'all', 'any', 'ascii', 'bin', 'bool', 'bytearray', 'bytes', 'callable', 'chr', 'classmethod', 'compile', 'complex', 'copyright', 'credits', 'delattr', 'dict', 'dir', 'divmod', 'enumerate', 'eval', 'exec', 'exit', 'filter', 'float', 'format', 'frozenset', 'getattr', 'globals', 'hasattr', 'hash', 'help', 'hex', 'id', 'input', 'int', 'isinstance', 'issubclass', 'iter', 'len', 'license', 'list', 'locals', 'map', 'max', 'memoryview', 'min', 'next', 'object', 'oct', 'open', 'ord', 'pow', 'print', 'property', 'quit', 'range', 'repr', 'reversed', 'round', 'set', 'setattr', 'slice', 'sorted', 'staticmethod', 'str', 'sum', 'super', 'tuple', 'type', 'vars', 'zip']

And note that some of these (like copyright) are not really that big of a deal to override.

One more caveat: note that in Python 2, True, False, and None are not considered keywords. However, assigning to None is a SyntaxError. Assigning to True or False is allowed, though not recommended (same with any other builtin). In Python 3, they are keywords, so this is not an issue.


John: as a slight improvement, I added a $ in the re, otherwise, the test does not detect spaces:

import keyword
import re

# ASCII identifier shape. ``\Z`` anchors at the very end of the string;
# a plain ``$`` would also match just before a trailing newline and so let
# "foo\n" through.
IDENTIFIER_RE = re.compile(r"[_A-Za-z][_a-zA-Z0-9]*\Z")

my_var = "$testBadVar"

# Prints True only when the whole string has identifier shape and is not a
# reserved keyword. The single-argument call form of print is valid on both
# Python 2 and Python 3.
print(IDENTIFIER_RE.match(my_var) is not None and not keyword.iskeyword(my_var))