例子如下
$ cat hello.py
def hello():
print("hello world")
if __name__ == "__main__":
hello()
一、token状态结构体
源码字符流 ==> 词法分析程序 ==> token流
python3.10.2 的词法分析主要实现在 Parser/tokenizer.c 中
主要的结构体struct tok_state
/* Tokenizer state */
struct tok_state {
/* Input state; buf <= cur <= inp <= end */
/* NB an entire line is held in the buffer */
char *buf; /* Input buffer, or NULL; malloc'ed if fp != NULL */
char *cur; /* Next character in buffer */
char *inp; /* End of data in buffer */
int fp_interactive; /* If the file descriptor is interactive */
char *interactive_src_start; /* The start of the source parsed so far in interactive mode */
char *interactive_src_end; /* The end of the source parsed so far in interactive mode */
const char *end; /* End of input buffer if buf != NULL */
const char *start; /* Start of current token if not NULL */
int done; /* E_OK normally, E_EOF at EOF, otherwise error code */
/* NB If done != E_OK, cur must be == inp!!! */
FILE *fp; /* Rest of input; NULL if tokenizing a string */
int tabsize; /* Tab spacing */
int indent; /* Current indentation index */
int indstack[MAXINDENT]; /* Stack of indents */
int atbol; /* Nonzero if at begin of new line */
int pendin; /* Pending indents (if > 0) or dedents (if < 0) */
const char *prompt, *nextprompt; /* For interactive prompting */
int lineno; /* Current line number */
int first_lineno; /* First line of a single line or multi line string
expression (cf. issue 16806) */
int level; /* () [] {} Parentheses nesting level */
/* Used to allow free continuations inside them */
char parenstack[MAXLEVEL]; /* One open-bracket char per nesting level (pushing code not shown in this excerpt) */
int parenlinenostack[MAXLEVEL]; /* Line number of each open bracket (presumably; verify against bracket-handling code) */
int parencolstack[MAXLEVEL]; /* Column of each open bracket (presumably; verify against bracket-handling code) */
PyObject *filename; /* Filename used in error messages (see ensure_utf8) */
/* Stuff for checking on different tab sizes */
int altindstack[MAXINDENT]; /* Stack of alternate indents */
/* Stuff for PEP 0263 */
enum decoding_state decoding_state;
int decoding_erred; /* whether erred in decoding */
char *encoding; /* Source encoding. */
int cont_line; /* whether we are in a continuation line. */
const char* line_start; /* pointer to start of current line */
const char* multi_line_start; /* pointer to start of first line of
a single line or multi line string
expression (cf. issue 16806) */
PyObject *decoding_readline; /* open(...).readline */
PyObject *decoding_buffer; /* Pending decoded data paired with decoding_readline (usage not shown in this excerpt) */
const char* enc; /* Encoding for the current str. */
char* str; /* Source string being tokenized (if tokenizing from a string)*/
char* input; /* Tokenizer's newline translated copy of the string. */
int type_comments; /* Whether to look for type comments */
/* async/await related fields (still needed depending on feature_version) */
int async_hacks; /* =1 if async/await aren't always keywords */
int async_def; /* =1 if tokens are inside an 'async def' body. */
int async_def_indent; /* Indentation level of the outermost 'async def'. */
int async_def_nl; /* =1 if the outermost 'async def' had at least one
NEWLINE token after it. */
/* How to proceed when asked for a new token in interactive mode */
enum interactive_underflow_t interactive_underflow;
};
二、初始化
/* Create and initialize a new tok_state structure */
static struct tok_state *
tok_new(void)
{
struct tok_state *tok = (struct tok_state *)PyMem_Malloc(
sizeof(struct tok_state));
if (tok == NULL)
return NULL;
/* Empty buffer: cur == inp forces the first tok_nextc() call to refill. */
tok->buf = tok->cur = tok->inp = NULL;
tok->fp_interactive = 0;
tok->interactive_src_start = NULL;
tok->interactive_src_end = NULL;
tok->start = NULL;
tok->end = NULL;
tok->done = E_OK;
tok->fp = NULL;
tok->input = NULL;
tok->tabsize = TABSIZE;
tok->indent = 0;
tok->indstack[0] = 0; /* only slot 0 is initialized; indent == 0 makes it the active one */
tok->atbol = 1;
tok->pendin = 0;
tok->prompt = tok->nextprompt = NULL;
tok->lineno = 0; /* tok_underflow_file increments this as each line is read */
tok->level = 0;
tok->altindstack[0] = 0;
tok->decoding_state = STATE_INIT; /* encoding not determined yet; see check_bom */
tok->decoding_erred = 0;
tok->enc = NULL;
tok->encoding = NULL;
tok->cont_line = 0;
tok->filename = NULL;
tok->decoding_readline = NULL;
tok->decoding_buffer = NULL;
tok->type_comments = 0;
tok->async_hacks = 0;
tok->async_def = 0;
tok->async_def_indent = 0;
tok->async_def_nl = 0;
tok->interactive_underflow = IUNDERFLOW_NORMAL;
tok->str = NULL;
return tok;
}
/* Set up tokenization of the open file fp.  enc is an optional encoding
   name (copied into tok->encoding); ps1/ps2 are the interactive prompts
   (NULL when not interactive).  Returns NULL on allocation failure. */
struct tok_state *
PyTokenizer_FromFile(FILE *fp, const char* enc,
const char *ps1, const char *ps2)
{
struct tok_state *tok = tok_new();
if (tok == NULL)
return NULL;
/* Allocate the line buffer; BUFSIZ is the stdio default (commonly 8 KiB). */
if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
PyTokenizer_Free(tok);
return NULL;
}
tok->cur = tok->inp = tok->buf; /* buffer starts empty */
tok->end = tok->buf + BUFSIZ;
tok->fp = fp;
tok->prompt = ps1;
tok->nextprompt = ps2;
if (enc != NULL) {
/* Must copy encoding declaration since it
gets copied into the parse tree. */
tok->encoding = new_string(enc, strlen(enc), tok);
if (!tok->encoding) {
PyTokenizer_Free(tok);
return NULL;
}
/* Caller supplied the encoding, so skip BOM/coding-spec detection. */
tok->decoding_state = STATE_NORMAL;
}
return tok;
}
初始化后,创建了一个大小为 BUFSIZ(stdio 的默认缓冲区大小,常见平台上为 8K)的缓冲区,用于读取源代码进行处理
三、缓冲区
词法分析入口函数
/* Fetch the next token.  tok_get() sets *p_start/*p_end to delimit the
   token text inside the buffer.  A decoding error during tokenization is
   surfaced as ERRORTOKEN with tok->done = E_DECODE. */
int
PyTokenizer_Get(struct tok_state *tok, const char **p_start, const char **p_end)
{
int result = tok_get(tok, p_start, p_end);
if (tok->decoding_erred) {
result = ERRORTOKEN;
tok->done = E_DECODE;
}
return result;
}
其中tok_get
函数是真正进行词法分析的处理函数。
调用tok_nextc
获取一个一个的字符进行识别出不同的token。
/* Return the next input byte, refilling the buffer when it runs dry.
   Returns EOF at end of input or on error (tok->done says which). */
static int
tok_nextc(struct tok_state *tok)
{
int rc;
for (;;) {
if (tok->cur != tok->inp) {// buffer still holds unread data; on the very first call it is empty, so the underflow path below fills it first, after which bytes are served straight from the buffer
return Py_CHARMASK(*tok->cur++); /* Fast path */
}
if (tok->done != E_OK)
return EOF;
if (tok->fp == NULL) { // tokenizing a plain in-memory string: refill from it
rc = tok_underflow_string(tok);
}
else if (tok->prompt != NULL) { // interactive mode: refill from the line typed at the prompt
rc = tok_underflow_interactive(tok);
}
else { // refill by reading from the file
rc = tok_underflow_file(tok);
}
...
if (!rc) {// the refill produced no data: end of input
tok->cur = tok->inp;
return EOF;
}
tok->line_start = tok->cur; /* the refilled data starts a new line */
}
Py_UNREACHABLE();
}
2.1 从文件中读取一行到缓冲区中
/* Refill the buffer with the next line of the file.
   Returns 1 on success, 0 at EOF or on error (tok->done says which). */
static int
tok_underflow_file(struct tok_state *tok) {
if (tok->start == NULL) {
tok->cur = tok->inp = tok->buf; /* no token in progress: reuse the whole buffer */
}
if (tok->decoding_state == STATE_INIT) {
/* We have not yet determined the encoding.
If an encoding is found, use the file-pointer
reader functions from now on. */
if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) {
error_ret(tok);
return 0;
}
assert(tok->decoding_state != STATE_INIT);
}
/* Read until '\n' or EOF */
if (tok->decoding_readline != NULL) {
/* We already have a codec associated with this input. */
if (!tok_readline_recode(tok)) {
return 0;
}
}
else {
/* We want a 'raw' read. */
if (!tok_readline_raw(tok)) {
return 0;
}
}
if (tok->inp == tok->cur) {
/* Nothing was read: end of file. */
tok->done = E_EOF;
return 0;
}
if (tok->inp[-1] != '\n') {
/* Last line does not end in \n, fake one */
*tok->inp++ = '\n';
*tok->inp = '\0';
}
tok->lineno++;
if (tok->decoding_state != STATE_NORMAL) {
/* PEP 263: a coding declaration may only appear on line 1 or 2. */
if (tok->lineno > 2) {
tok->decoding_state = STATE_NORMAL;
}
else if (!check_coding_spec(tok->cur, strlen(tok->cur),
tok, fp_setreadl))
{
return 0;
}
}
/* The default encoding is UTF-8, so make sure we don't have any
non-UTF-8 sequences in it. */
if (!tok->encoding && !ensure_utf8(tok->cur, tok)) {
error_ret(tok);
return 0;
}
assert(tok->done == E_OK);
return tok->done == E_OK;
}
2.1.1 检查文件是否为BOM编码
因为tok->decoding_state == STATE_INIT
将进入check_bom
函数进行检查是否为bom格式,如果开头是0xEF 0xBB 0xBF,则将使用utf-8编码进行转码。解析状态进入STATE_SEEK_CODING状态。
/* See whether the file starts with a BOM. If it does,
invoke the set_readline function with the new encoding.
Return 1 on success, 0 on failure. */
static int
check_bom(int get_char(struct tok_state *),
void unget_char(int, struct tok_state *),
int set_readline(struct tok_state *, const char *),
struct tok_state *tok)
{
int ch1, ch2, ch3;
ch1 = get_char(tok);
/* Whatever happens, the next step is to look for a PEP 263 coding comment. */
tok->decoding_state = STATE_SEEK_CODING;
if (ch1 == EOF) {
return 1;
} else if (ch1 == 0xEF) { /* possible UTF-8 BOM: EF BB BF */
ch2 = get_char(tok);
if (ch2 != 0xBB) {
/* Not a BOM: push both bytes back for normal tokenization. */
unget_char(ch2, tok);
unget_char(ch1, tok);
return 1;
}
ch3 = get_char(tok);
if (ch3 != 0xBF) {
unget_char(ch3, tok);
unget_char(ch2, tok);
unget_char(ch1, tok);
return 1;
}
} else {
unget_char(ch1, tok);
return 1;
}
/* A UTF-8 BOM was consumed: record the encoding as utf-8. */
if (tok->encoding != NULL)
PyMem_Free(tok->encoding);
tok->encoding = new_string("utf-8", 5, tok);
if (!tok->encoding)
return 0;
/* No need to set_readline: input is already utf-8 */
return 1;
}
2.1.2 读取一行数据
tok_readline_raw
/* Read raw (undecoded) input into the buffer until a full line ('\n')
   is present.  Returns 1 on success or EOF, 0 on error. */
static int
tok_readline_raw(struct tok_state *tok)
{
do {
// grow the buffer first if there is not enough room for another chunk
if (!tok_reserve_buf(tok, BUFSIZ)) {
return 0;
}
// read (up to) one line into the buffer
char *line = Py_UniversalNewlineFgets(tok->inp,
(int)(tok->end - tok->inp),
tok->fp, NULL);
if (line == NULL) {
return 1; /* EOF: nothing more to read, but not an error */
}
if (tok->fp_interactive &&
tok_concatenate_interactive_new_line(tok, line) == -1) {
return 0;
}
tok->inp = strchr(tok->inp, '\0'); /* advance inp past what was just read */
if (tok->inp == tok->buf) {
return 0; /* buffer is still empty */
}
} while (tok->inp[-1] != '\n'); /* keep reading until the line is complete */
return 1;
}
Py_UniversalNewlineFgets
函数读取一行数据,并且将\r或\r\n替换为\n
2.1.3 最后一行追加\n
文件最后一行可能没有\n,如果没有则需要添加
static int
tok_underflow_file(struct tok_state *tok) {
... /* (excerpt; '...' marks code elided by the article) */
if (tok->inp[-1] != '\n') {
/* Last line does not end in \n, fake one */
*tok->inp++ = '\n';
*tok->inp = '\0';
}
...
}
2.1.4 检查具体编码
从文件开头两行中检查编码,现在decoding_state的状态为STATE_SEEK_CODING, 并且lineno为1, 所以将进入到具体的编码检测中。
static int
tok_underflow_file(struct tok_state *tok) {
... /* (excerpt; '...' marks code elided by the article) */
tok->lineno++;
if (tok->decoding_state != STATE_NORMAL) {
/* PEP 263: a coding declaration may only appear on line 1 or 2. */
if (tok->lineno > 2) {
tok->decoding_state = STATE_NORMAL;
}
else if (!check_coding_spec(tok->cur, strlen(tok->cur),
tok, fp_setreadl))
{
return 0;
}
}
...
}
/* Check whether the line contains a coding spec. If it does,
invoke the set_readline function for the new encoding.
This function receives the tok_state and the new encoding.
Return 1 on success, 0 on failure. */
static int
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
int set_readline(struct tok_state *, const char *))
{
char *cs;
if (tok->cont_line) {
/* It's a continuation line, so it can't be a coding spec. */
tok->decoding_state = STATE_NORMAL;
return 1;
}
if (!get_coding_spec(line, &cs, size, tok)) {
return 0;
}
if (!cs) {
/* No coding spec found on this line.  If the line contains anything
other than whitespace or a comment, stop looking on later lines. */
Py_ssize_t i;
for (i = 0; i < size; i++) {
if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
break;
if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') { /* '\014' == form feed */
/* Stop checking coding spec after a line containing
* anything except a comment. */
tok->decoding_state = STATE_NORMAL;
break;
}
}
return 1;
}
tok->decoding_state = STATE_NORMAL;
if (tok->encoding == NULL) {
assert(tok->decoding_readline == NULL);
/* For any encoding other than utf-8, install a decoding readline. */
if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) {
error_ret(tok);
PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
PyMem_Free(cs);
return 0;
}
tok->encoding = cs; /* ownership of cs transfers to tok */
} else { /* then, compare cs with BOM */
/* A BOM already fixed the encoding (utf-8, see check_bom); the
declared coding must agree with it. */
if (strcmp(tok->encoding, cs) != 0) {
error_ret(tok);
PyErr_Format(PyExc_SyntaxError,
"encoding problem: %s with BOM", cs);
PyMem_Free(cs);
return 0;
}
PyMem_Free(cs);
}
return 1;
}
其中函数get_coding_spec
从单行的注释中搜索coding:xxx 或者coding=xxx
decoding_state进入STATE_NORMAL状态
如果找到编码方式xxx, 并且不等于utf-8,则调用回调函数set_readline设置编码
2.1.5 判断默认编码方式下读取的数据是否是合法的utf8
因为hello.py中第一行或者第二行中并未指定编码方式,所以默认是utf8编码,并且不是BOM编码的文件,所以tok->encoding=NULL。
static int
tok_underflow_file(struct tok_state *tok) {
... /* (excerpt; '...' marks code elided by the article) */
/* The default encoding is UTF-8, so make sure we don't have any
non-UTF-8 sequences in it. */
if (!tok->encoding && !ensure_utf8(tok->cur, tok)) {
error_ret(tok);
return 0;
}
...
}
/* Verify that line consists entirely of valid UTF-8 sequences.
   On failure, raise SyntaxError (cf. PEP 263) and return 0. */
static int
ensure_utf8(char *line, struct tok_state *tok)
{
int badchar = 0;
unsigned char *c;
int length;
for (c = (unsigned char *)line; *c; c += length) { /* advance one UTF-8 sequence at a time */
if (!(length = valid_utf8(c))) {
badchar = *c; /* remember the offending lead byte for the error message */
break;
}
}
if (badchar) {
/* Need to add 1 to the line number, since this line
has not been counted, yet. */
PyErr_Format(PyExc_SyntaxError,
"Non-UTF-8 code starting with '\\x%.2x' "
"in file %U on line %i, "
"but no encoding declared; "
"see https://python.org/dev/peps/pep-0263/ for details",
badchar, tok->filename, tok->lineno + 1);
return 0;
}
return 1;
}
/* Check whether the characters at s start a valid
UTF-8 sequence. Return the number of characters forming
the sequence if yes, 0 if not. */
/* Check whether the bytes at s begin a well-formed UTF-8 sequence.
   Return the sequence length (1..4) if so, 0 otherwise. */
static int
valid_utf8(const unsigned char *s)
{
    unsigned char lead = s[0];
    int nbytes;

    if (lead < 0x80) {
        /* Plain ASCII: a sequence of one. */
        return 1;
    }
    if (lead < 0xC0) {
        /* 10xxxxxx is a continuation byte; it cannot start a sequence. */
        return 0;
    }
    if (lead < 0xE0) {
        nbytes = 2;           /* 110xxxxx */
    }
    else if (lead < 0xF0) {
        nbytes = 3;           /* 1110xxxx */
    }
    else if (lead < 0xF8) {
        nbytes = 4;           /* 11110xxx */
    }
    else {
        /* 0xF8..0xFF never appear in UTF-8. */
        return 0;
    }
    /* Every remaining byte of the sequence must match 10xxxxxx. */
    for (int i = 1; i < nbytes; i++) {
        if ((s[i] & 0xC0) != 0x80) {
            return 0;
        }
    }
    return nbytes;
}
当一行读取到缓冲区buf中后,结构图如下
2.2 获取一个字符
现在tok->cur != tok->inp成立,直接取出当前字符。
static int
tok_nextc(struct tok_state *tok)
{
int rc;
for (;;) {
if (tok->cur != tok->inp) { /* buffer holds unread data: serve the next byte */
return Py_CHARMASK(*tok->cur++); /* Fast path */
}
... /* (excerpt; '...' marks code elided by the article) */
}
...
}
/* Argument must be a char or an int in [-128, 127] or [0, 255]. */
/* Masks to the low 8 bits and casts, yielding the byte value 0..255
   even on platforms where plain char is signed. */
#define Py_CHARMASK(c) ((unsigned char)((c) & 0xff))
比如现在取出字符’d’,‘e’,'f’后的结构