1. Tokens
A token consists of a token name plus an optional attribute value (the literal text, for tokens that have a fixed spelling). All token types are defined in Grammar/Tokens; the full file is reproduced below, and a short tokenize example follows the list.
ENDMARKER
NAME
NUMBER
STRING
NEWLINE
INDENT
DEDENT
LPAR '('
RPAR ')'
LSQB '['
RSQB ']'
COLON ':'
COMMA ','
SEMI ';'
PLUS '+'
MINUS '-'
STAR '*'
SLASH '/'
VBAR '|'
AMPER '&'
LESS '<'
GREATER '>'
EQUAL '='
DOT '.'
PERCENT '%'
LBRACE '{'
RBRACE '}'
EQEQUAL '=='
NOTEQUAL '!='
LESSEQUAL '<='
GREATEREQUAL '>='
TILDE '~'
CIRCUMFLEX '^'
LEFTSHIFT '<<'
RIGHTSHIFT '>>'
DOUBLESTAR '**'
PLUSEQUAL '+='
MINEQUAL '-='
STAREQUAL '*='
SLASHEQUAL '/='
PERCENTEQUAL '%='
AMPEREQUAL '&='
VBAREQUAL '|='
CIRCUMFLEXEQUAL '^='
LEFTSHIFTEQUAL '<<='
RIGHTSHIFTEQUAL '>>='
DOUBLESTAREQUAL '**='
DOUBLESLASH '//'
DOUBLESLASHEQUAL '//='
AT '@'
ATEQUAL '@='
RARROW '->'
ELLIPSIS '...'
COLONEQUAL ':='
OP
AWAIT
ASYNC
TYPE_IGNORE
TYPE_COMMENT
SOFT_KEYWORD
ERRORTOKEN
# These aren't used by the C tokenizer but are needed for tokenize.py
COMMENT
NL
ENCODING
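These token names are exactly what Python's own tokenize module reports. As a minimal sketch (standard library only), the snippet below tokenizes a small expression and prints each token's name together with its attribute value, i.e. the matched source text; exact_type is used so operators show up as LPAR, DOUBLESTAR, etc. instead of the generic OP:

# Minimal sketch: print token name + attribute value for a small snippet.
import io
import tokenize

source = "x = (1 + 2) ** 3\n"
for tok in tokenize.generate_tokens(io.StringIO(source).readline):
    # exact_type distinguishes LPAR/DOUBLESTAR/... rather than the generic OP
    print(f"{tokenize.tok_name[tok.exact_type]:<12} {tok.string!r}")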
2. Auto-Generated Code
When ./configure is run, Makefile.pre.in is turned into Makefile. Makefile.pre.in contains the following rule:
.PHONY: regen-token
regen-token:
        # Regenerate Doc/library/token-list.inc from Grammar/Tokens
        # using Tools/scripts/generate_token.py
        $(PYTHON_FOR_REGEN) $(srcdir)/Tools/scripts/generate_token.py rst \
                $(srcdir)/Grammar/Tokens \
                $(srcdir)/Doc/library/token-list.inc
        # Regenerate Include/token.h from Grammar/Tokens
        # using Tools/scripts/generate_token.py
        $(PYTHON_FOR_REGEN) $(srcdir)/Tools/scripts/generate_token.py h \
                $(srcdir)/Grammar/Tokens \
                $(srcdir)/Include/token.h
        # Regenerate Parser/token.c from Grammar/Tokens
        # using Tools/scripts/generate_token.py
        $(PYTHON_FOR_REGEN) $(srcdir)/Tools/scripts/generate_token.py c \
                $(srcdir)/Grammar/Tokens \
                $(srcdir)/Parser/token.c
        # Regenerate Lib/token.py from Grammar/Tokens
        # using Tools/scripts/generate_token.py
        $(PYTHON_FOR_REGEN) $(srcdir)/Tools/scripts/generate_token.py py \
                $(srcdir)/Grammar/Tokens \
                $(srcdir)/Lib/token.py
Running make regen-token therefore invokes Tools/scripts/generate_token.py four times to regenerate the token-related files: Doc/library/token-list.inc, Include/token.h, Parser/token.c and Lib/token.py.
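generate_token.py itself is not reproduced here. As a rough illustration only (this is not the actual script), each non-comment line of Grammar/Tokens is just a token name plus an optional quoted literal, which a few lines of Python can split apart:

# Simplified sketch (not the real generate_token.py): parse Grammar/Tokens
# lines into a name list (index = numeric token value) and a literal->name map.
import re

LINE_RE = re.compile(r"(\w+)\s*(?:'(.+)')?")

def parse_tokens(lines):
    names = []   # token names in definition order; the index is the token value
    exact = {}   # literal text -> token name, e.g. '(' -> 'LPAR'
    for line in lines:
        line = line.strip()
        if not line or line.startswith('#'):
            continue                      # skip blanks and comments
        name, string = LINE_RE.match(line).groups()
        names.append(name)
        if string:
            exact[string] = name
    return names, exact

names, exact = parse_tokens(["LPAR '('", "RPAR ')'", "NAME", "# a comment"])
print(names)   # ['LPAR', 'RPAR', 'NAME']
print(exact)   # {'(': 'LPAR', ')': 'RPAR'}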
2.1 Include/token.h
/* Auto-generated by Tools/scripts/generate_token.py */
/* Token types */
#ifndef Py_LIMITED_API
#ifndef Py_TOKEN_H
#define Py_TOKEN_H
#ifdef __cplusplus
extern "C" {
#endif
#undef TILDE /* Prevent clash of our definition with system macro. Ex AIX, ioctl.h */
#define ENDMARKER 0
#define NAME 1
#define NUMBER 2
#define STRING 3
#define NEWLINE 4
#define INDENT 5
#define DEDENT 6
#define LPAR 7
#define RPAR 8
#define LSQB 9
#define RSQB 10
#define COLON 11
#define COMMA 12
#define SEMI 13
#define PLUS 14
#define MINUS 15
#define STAR 16
#define SLASH 17
#define VBAR 18
#define AMPER 19
#define LESS 20
#define GREATER 21
#define EQUAL 22
#define DOT 23
#define PERCENT 24
#define LBRACE 25
#define RBRACE 26
#define EQEQUAL 27
#define NOTEQUAL 28
#define LESSEQUAL 29
#define GREATEREQUAL 30
#define TILDE 31
#define CIRCUMFLEX 32
#define LEFTSHIFT 33
#define RIGHTSHIFT 34
#define DOUBLESTAR 35
#define PLUSEQUAL 36
#define MINEQUAL 37
#define STAREQUAL 38
#define SLASHEQUAL 39
#define PERCENTEQUAL 40
#define AMPEREQUAL 41
#define VBAREQUAL 42
#define CIRCUMFLEXEQUAL 43
#define LEFTSHIFTEQUAL 44
#define RIGHTSHIFTEQUAL 45
#define DOUBLESTAREQUAL 46
#define DOUBLESLASH 47
#define DOUBLESLASHEQUAL 48
#define AT 49
#define ATEQUAL 50
#define RARROW 51
#define ELLIPSIS 52
#define COLONEQUAL 53
#define OP 54
#define AWAIT 55
#define ASYNC 56
#define TYPE_IGNORE 57
#define TYPE_COMMENT 58
#define SOFT_KEYWORD 59
#define ERRORTOKEN 60
#define N_TOKENS 64
#define NT_OFFSET 256
/* Special definitions for cooperation with parser */
#define ISTERMINAL(x) ((x) < NT_OFFSET)
#define ISNONTERMINAL(x) ((x) >= NT_OFFSET)
#define ISEOF(x) ((x) == ENDMARKER)
#define ISWHITESPACE(x)  ((x) == ENDMARKER || \
                          (x) == NEWLINE   || \
                          (x) == INDENT    || \
                          (x) == DEDENT)
PyAPI_DATA(const char * const) _PyParser_TokenNames[]; /* Token names */
PyAPI_FUNC(int) PyToken_OneChar(int);
PyAPI_FUNC(int) PyToken_TwoChars(int, int);
PyAPI_FUNC(int) PyToken_ThreeChars(int, int, int);
#ifdef __cplusplus
}
#endif
#endif /* !Py_TOKEN_H */
#endif /* Py_LIMITED_API */
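The generated Lib/token.py exposes the same numbering and the ISTERMINAL/ISNONTERMINAL/ISEOF helpers on the Python side. A quick check from the interpreter; the printed values assume the numbering shown above:

# Python-side counterpart of Include/token.h (generated Lib/token.py).
import token

print(token.LPAR, token.tok_name[token.LPAR])    # 7 LPAR
print(token.N_TOKENS, token.NT_OFFSET)           # 64 256
print(token.ISTERMINAL(token.NAME))              # True
print(token.ISNONTERMINAL(token.NT_OFFSET + 1))  # True
print(token.ISEOF(token.ENDMARKER))              # True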
2.2 Parser/token.c
/* Auto-generated by Tools/scripts/generate_token.py */
#include "Python.h"
#include "token.h"
/* Token names */
const char * const _PyParser_TokenNames[] = {
"ENDMARKER",
"NAME",
"NUMBER",
"STRING",
"NEWLINE",
"INDENT",
"DEDENT",
"LPAR",
"RPAR",
"LSQB",
"RSQB",
"COLON",
"COMMA",
"SEMI",
"PLUS",
"MINUS",
"STAR",
"SLASH",
"VBAR",
"AMPER",
"LESS",
"GREATER",
"EQUAL",
"DOT",
"PERCENT",
"LBRACE",
"RBRACE",
"EQEQUAL",
"NOTEQUAL",
"LESSEQUAL",
"GREATEREQUAL",
"TILDE",
"CIRCUMFLEX",
"LEFTSHIFT",
"RIGHTSHIFT",
"DOUBLESTAR",
"PLUSEQUAL",
"MINEQUAL",
"STAREQUAL",
"SLASHEQUAL",
"PERCENTEQUAL",
"AMPEREQUAL",
"VBAREQUAL",
"CIRCUMFLEXEQUAL",
"LEFTSHIFTEQUAL",
"RIGHTSHIFTEQUAL",
"DOUBLESTAREQUAL",
"DOUBLESLASH",
"DOUBLESLASHEQUAL",
"AT",
"ATEQUAL",
"RARROW",
"ELLIPSIS",
"COLONEQUAL",
"OP",
"AWAIT",
"ASYNC",
"TYPE_IGNORE",
"TYPE_COMMENT",
"SOFT_KEYWORD",
"<ERRORTOKEN>",
"<COMMENT>",
"<NL>",
"<ENCODING>",
"<N_TOKENS>",
};
/* Return the token corresponding to a single character */
int
PyToken_OneChar(int c1)
{
    switch (c1) {
    case '%': return PERCENT;
    case '&': return AMPER;
    case '(': return LPAR;
    case ')': return RPAR;
    case '*': return STAR;
    case '+': return PLUS;
    case ',': return COMMA;
    case '-': return MINUS;
    case '.': return DOT;
    case '/': return SLASH;
    case ':': return COLON;
    case ';': return SEMI;
    case '<': return LESS;
    case '=': return EQUAL;
    case '>': return GREATER;
    case '@': return AT;
    case '[': return LSQB;
    case ']': return RSQB;
    case '^': return CIRCUMFLEX;
    case '{': return LBRACE;
    case '|': return VBAR;
    case '}': return RBRACE;
    case '~': return TILDE;
    }
    return OP;
}
int
PyToken_TwoChars(int c1, int c2)
{
    switch (c1) {
    case '!':
        switch (c2) {
        case '=': return NOTEQUAL;
        }
        break;
    case '%':
        switch (c2) {
        case '=': return PERCENTEQUAL;
        }
        break;
    case '&':
        switch (c2) {
        case '=': return AMPEREQUAL;
        }
        break;
    case '*':
        switch (c2) {
        case '*': return DOUBLESTAR;
        case '=': return STAREQUAL;
        }
        break;
    case '+':
        switch (c2) {
        case '=': return PLUSEQUAL;
        }
        break;
    case '-':
        switch (c2) {
        case '=': return MINEQUAL;
        case '>': return RARROW;
        }
        break;
    case '/':
        switch (c2) {
        case '/': return DOUBLESLASH;
        case '=': return SLASHEQUAL;
        }
        break;
    case ':':
        switch (c2) {
        case '=': return COLONEQUAL;
        }
        break;
    case '<':
        switch (c2) {
        case '<': return LEFTSHIFT;
        case '=': return LESSEQUAL;
        case '>': return NOTEQUAL;
        }
        break;
    case '=':
        switch (c2) {
        case '=': return EQEQUAL;
        }
        break;
    case '>':
        switch (c2) {
        case '=': return GREATEREQUAL;
        case '>': return RIGHTSHIFT;
        }
        break;
    case '@':
        switch (c2) {
        case '=': return ATEQUAL;
        }
        break;
    case '^':
        switch (c2) {
        case '=': return CIRCUMFLEXEQUAL;
        }
        break;
    case '|':
        switch (c2) {
        case '=': return VBAREQUAL;
        }
        break;
    }
    return OP;
}
int
PyToken_ThreeChars(int c1, int c2, int c3)
{
    switch (c1) {
    case '*':
        switch (c2) {
        case '*':
            switch (c3) {
            case '=': return DOUBLESTAREQUAL;
            }
            break;
        }
        break;
    case '.':
        switch (c2) {
        case '.':
            switch (c3) {
            case '.': return ELLIPSIS;
            }
            break;
        }
        break;
    case '/':
        switch (c2) {
        case '/':
            switch (c3) {
            case '=': return DOUBLESLASHEQUAL;
            }
            break;
        }
        break;
    case '<':
        switch (c2) {
        case '<':
            switch (c3) {
            case '=': return LEFTSHIFTEQUAL;
            }
            break;
        }
        break;
    case '>':
        switch (c2) {
        case '>':
            switch (c3) {
            case '=': return RIGHTSHIFTEQUAL;
            }
            break;
        }
        break;
    }
    return OP;
}
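On the Python side, the same operator-text to token-type mapping that PyToken_OneChar/PyToken_TwoChars/PyToken_ThreeChars implement is available in recent CPython versions as the generated token.EXACT_TOKEN_TYPES dictionary:

# token.EXACT_TOKEN_TYPES maps operator text to token type, mirroring the
# one-, two- and three-character lookups above.
import token

for text in ('+', '//', '**=', '...', ':='):
    print(f"{text!r:7} -> {token.tok_name[token.EXACT_TOKEN_TYPES[text]]}")
# '+'     -> PLUS
# '//'    -> DOUBLESLASH
# '**='   -> DOUBLESTAREQUAL
# '...'   -> ELLIPSIS
# ':='    -> COLONEQUAL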