例子如下
$ cat hello.py
def hello():
print("hello world")
if __name__ == "__main__":
hello()
一、token状态结构体
源码字符流 ==> 词法分析程序 ==> token流
python3.10.2 的词法分析主要实现在 Parser/tokenizer.c 中
主要的结构体struct tok_state
/* Tokenizer state */
struct tok_state {
/* Input state; buf <= cur <= inp <= end */
/* NB an entire line is held in the buffer */
char *buf; /* Input buffer, or NULL; malloc'ed if fp != NULL */
char *cur; /* Next character in buffer */
char *inp; /* End of data in buffer */
int fp_interactive; /* If the file descriptor is interactive */
char *interactive_src_start; /* The start of the source parsed so far in interactive mode */
char *interactive_src_end; /* The end of the source parsed so far in interactive mode */
const char *end; /* End of input buffer if buf != NULL */
const char *start; /* Start of current token if not NULL */
int done; /* E_OK normally, E_EOF at EOF, otherwise error code */
/* NB If done != E_OK, cur must be == inp!!! */
FILE *fp; /* Rest of input; NULL if tokenizing a string */
int tabsize; /* Tab spacing */
int indent; /* Current indentation index */
int indstack[MAXINDENT]; /* Stack of indents */
int atbol; /* Nonzero if at begin of new line */
int pendin; /* Pending indents (if > 0) or dedents (if < 0) */
const char *prompt, *nextprompt; /* For interactive prompting */
int lineno; /* Current line number */
int first_lineno; /* First line of a single line or multi line string
expression (cf. issue 16806) */
int level; /* () [] {} Parentheses nesting level */
/* Used to allow free continuations inside them */
char parenstack[MAXLEVEL]; /* One open-bracket char per nesting level (pushing code not shown in this excerpt) */
int parenlinenostack[MAXLEVEL]; /* Line number of each open bracket (presumably; verify against bracket-handling code) */
int parencolstack[MAXLEVEL]; /* Column of each open bracket (presumably; verify against bracket-handling code) */
PyObject *filename; /* Filename used in error messages (see ensure_utf8) */
/* Stuff for checking on different tab sizes */
int altindstack[MAXINDENT]; /* Stack of alternate indents */
/* Stuff for PEP 0263 */
enum decoding_state decoding_state;
int decoding_erred; /* whether erred in decoding */
char *encoding; /* Source encoding. */
int cont_line; /* whether we are in a continuation line. */
const char* line_start; /* pointer to start of current line */
const char* multi_line_start; /* pointer to start of first line of
a single line or multi line string
expression (cf. issue 16806) */
PyObject *decoding_readline; /* open(...).readline */
PyObject *decoding_buffer; /* Pending decoded data paired with decoding_readline (usage not shown in this excerpt) */
const char* enc; /* Encoding for the current str. */
char* str; /* Source string being tokenized (if tokenizing from a string)*/
char* input; /* Tokenizer's newline translated copy of the string. */
int type_comments; /* Whether to look for type comments */
/* async/await related fields (still needed depending on feature_version) */
int async_hacks; /* =1 if async/await aren't always keywords */
int async_def; /* =1 if tokens are inside an 'async def' body. */
int async_def_indent; /* Indentation level of the outermost 'async def'. */
int async_def_nl; /* =1 if the outermost 'async def' had at least one
NEWLINE token after it. */
/* How to proceed when asked for a new token in interactive mode */
enum interactive_underflow_t interactive_underflow;
};
二、初始化
/* Create and initialize a new tok_state structure */
static struct tok_state *
tok_new(void)
{
struct tok_state *tok = (struct tok_state *)PyMem_Malloc(
sizeof(struct tok_state));
if (tok == NULL)
return NULL;
/* Empty buffer: cur == inp forces the first tok_nextc() call to refill. */
tok->buf = tok->cur = tok->inp = NULL;
tok->fp_interactive = 0;
tok->interactive_src_start = NULL;
tok->interactive_src_end = NULL;
tok->start = NULL;
tok->end = NULL;
tok->done = E_OK;
tok->fp = NULL;
tok->input = NULL;
tok->tabsize = TABSIZE;
tok->indent = 0;
tok->indstack[0] = 0; /* only slot 0 is initialized; indent == 0 makes it the active one */
tok->atbol = 1;
tok->pendin = 0;
tok->prompt = tok->nextprompt = NULL;
tok->lineno = 0; /* tok_underflow_file increments this as each line is read */
tok->level = 0;
tok->altindstack[0] = 0;
tok->decoding_state = STATE_INIT; /* encoding not determined yet; see check_bom */
tok->decoding_erred = 0;
tok->enc = NULL;
tok->encoding = NULL;
tok->cont_line = 0;
tok->filename = NULL;
tok->decoding_readline = NULL;
tok->decoding_buffer = NULL;
tok->type_comments = 0;
tok->async_hacks = 0;
tok->async_def = 0;
tok->async_def_indent = 0;
tok->async_def_nl = 0;
tok->interactive_underflow = IUNDERFLOW_NORMAL;
tok->str = NULL;
return tok;
}
/* Set up tokenization of the open file fp.  enc is an optional encoding
   name (copied into tok->encoding); ps1/ps2 are the interactive prompts
   (NULL when not interactive).  Returns NULL on allocation failure. */
struct tok_state *
PyTokenizer_FromFile(FILE *fp, const char* enc,
const char *ps1, const char *ps2)
{
struct tok_state *tok = tok_new();
if (tok == NULL)
return NULL;
/* Allocate the line buffer; BUFSIZ is the stdio default (commonly 8 KiB). */
if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
PyTokenizer_Free(tok);
return NULL;
}
tok->cur = tok->inp = tok->buf; /* buffer starts empty */
tok->end = tok->buf + BUFSIZ;
tok->fp = fp;
tok->prompt = ps1;
tok->nextprompt = ps2;
if (enc != NULL) {
/* Must copy encoding declaration since it
gets copied into the parse tree. */
tok->encoding = new_string(enc, strlen(enc), tok);
if (!tok->encoding) {
PyTokenizer_Free(tok);
return NULL;
}
/* Caller supplied the encoding, so skip BOM/coding-spec detection. */
tok->decoding_state = STATE_NORMAL;
}
return tok;
}
初始化后,创建了一个大小为 BUFSIZ(stdio 的默认缓冲区大小,常见平台上为 8K)的缓冲区,用于读取源代码进行处理
三、缓冲区
词法分析入口函数
/* Fetch the next token.  tok_get() sets *p_start/*p_end to delimit the
   token text inside the buffer.  A decoding error during tokenization is
   surfaced as ERRORTOKEN with tok->done = E_DECODE. */
int
PyTokenizer_Get(struct tok_state *tok, const char **p_start, const char **p_end)
{
int result = tok_get(tok, p_start, p_end);
if (tok->decoding_erred) {
result = ERRORTOKEN;
tok->done = E_DECODE;
}
return result;
}
其中tok_get
函数是真正进行词法分析的处理函数。
调用tok_nextc
获取一个一个的字符进行识别出不同的token。
/* Return the next input byte, refilling the buffer when it runs dry.
   Returns EOF at end of input or on error (tok->done says which). */
static int
tok_nextc(struct tok_state *tok)
{
int rc;
for (;;) {
if (tok->cur != tok->inp) {// buffer still holds unread data; on the very first call it is empty, so the underflow path below fills it first, after which bytes are served straight from the buffer
return Py_CHARMASK(*tok->cur++); /* Fast path */
}
if (tok->done != E_OK)
return EOF;
if (tok->fp == NULL) { // tokenizing a plain in-memory string: refill from it
rc = tok_underflow_string(tok);
}
else if (tok->prompt != NULL) { // interactive mode: refill from the line typed at the prompt
rc = tok_underflow_interactive(tok);
}
else { // refill by reading from the file
rc = tok_underflow_file(tok);
}
...
if (!rc) {// the refill produced no data: end of input
tok->cur = tok->inp;
return EOF;
}
tok->line_start = tok->cur; /* the refilled data starts a new line */
}
Py_UNREACHABLE();
}
2.1 从文件中读取一行到缓冲区中
/* Refill the buffer with the next line of the file.
   Returns 1 on success, 0 at EOF or on error (tok->done says which). */
static int
tok_underflow_file(struct tok_state *tok) {
if (tok->start == NULL) {
tok->cur = tok->inp = tok->buf; /* no token in progress: reuse the whole buffer */
}
if (tok->decoding_state == STATE_INIT) {
/* We have not yet determined the encoding.
If an encoding is found, use the file-pointer
reader functions from now on. */
if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) {
error_ret(tok);
return 0;
}
assert(tok->decoding_state != STATE_INIT);
}
/* Read until '\n' or EOF */
if (tok->decoding_readline != NULL) {
/* We already have a codec associated with this input. */
if (!tok_readline_recode(tok)) {
return 0;
}
}
else {
/* We want a 'raw' read. */
if (!tok_readline_raw(tok)) {
return 0;
}
}
if (tok->inp == tok->cur) {
/* Nothing was read: end of file. */
tok->done = E_EOF;
return 0;
}
if (tok->inp[-1] != '\n') {
/* Last line does not end in \n, fake one */
*tok->inp++ = '\n';
*tok->inp = '\0';
}
tok->lineno++;
if (tok->decoding_state != STATE_NORMAL) {
/* PEP 263: a coding declaration may only appear on line 1 or 2. */
if (tok->lineno > 2) {
tok->decoding_state = STATE_NORMAL;
}
else if (!check_coding_spec(tok->cur, strlen(tok->cur),
tok, fp_setreadl))
{
return 0;
}
}
/* The default encoding is UTF-8, so make sure we don't have any
non-UTF-8 sequences in it. */
if (!tok->encoding && !ensure_utf8(tok->cur, tok)) {
error_ret(tok);
return 0;
}
assert(tok->done == E_OK);
return tok->done == E_OK;
}
2.1.1 检查文件是否为BOM编码
因为tok->decoding_state == STATE_INIT
将进入check_bom
函数进行检查是否为bom格式,如果开头是0xEF 0xBB 0xBF,则将使用utf-8编码进行转码。解析状态进入STATE_SEEK_CODING状态。
/* See whether the file starts with a BOM. If it does,
invoke the set_readline function with the new encoding.
Return 1 on success, 0 on failure. */
static int
check_bom(int get_char(struct tok_state *),
void unget_char(int, struct tok_state *),
int set_readline(struct tok_state *, const char *),
struct tok_state *tok)
{
int ch1, ch2, ch3;
ch1 = get_char(tok);
/* Whatever happens, the next step is to look for a PEP 263 coding comment. */
tok->decoding_state = STATE_SEEK_CODING;
if (ch1 == EOF) {
return 1;
} else if (ch1 == 0xEF) { /* possible UTF-8 BOM: EF BB BF */
ch2 = get_char(tok);
if (ch2 != 0xBB) {
/* Not a BOM: push both bytes back for normal tokenization. */
unget_char(ch2, tok);
unget_char(ch1, tok);
return 1;
}
ch3 = get_char(tok);
if (ch3 != 0xBF) {
unget_char(ch3, tok);
unget_char(ch2, tok);
unget_char(ch1, tok);
return 1;
}
} else {
unget_char(ch1, tok);
return 1;
}
/* A UTF-8 BOM was consumed: record the encoding as utf-8. */
if (tok->encoding != NULL)
PyMem_Free(tok->encoding);
tok->encoding = new_string("utf-8", 5, tok);
if (!tok->encoding)
return 0;
/* No need to set_readline: input is already utf-8 */
return 1;
}
2.1.2 读取一行数据
tok_readline_raw
/* Read raw (undecoded) input into the buffer until a full line ('\n')
   is present.  Returns 1 on success or EOF, 0 on error. */
static int
tok_readline_raw(struct tok_state *tok)
{
do {
// grow the buffer first if there is not enough room for another chunk
if (!tok_reserve_buf(tok, BUFSIZ)) {
return 0;
}
// read (up to) one line into the buffer
char *line = Py_UniversalNewlineFgets(tok->inp,
(int)(tok->end - tok->inp),
tok->fp, NULL);
if (line == NULL) {
return 1; /* EOF: nothing more to read, but not an error */
}
if (tok->fp_interactive &&
tok_concatenate_interactive_new_line(tok, line) == -1) {
return 0;
}
tok->inp = strchr(tok->inp, '\0'); /* advance inp past what was just read */
if (tok->inp == tok->buf) {
return 0; /* buffer is still empty */
}
} while (tok->inp[-1] != '\n'); /* keep reading until the line is complete */
return 1;
}
Py_UniversalNewlineFgets
函数读取一行数据,并且将\r或\r\n替换为\n
2.1.3 最后一行追加\n
文件最后一行可能没有\n,如果没有则需要添加
static int
tok_underflow_file(struct tok_state *tok) {
... /* (excerpt; '...' marks code elided by the article) */
if (tok->inp[-1] != '\n') {
/* Last line does not end in \n, fake one */
*tok->inp++ = '\n';
*tok->inp = '\0';
}
...
}
2.1.4 检查具体编码
从文件开头两行中检查编码,现在decoding_state的状态为STATE_SEEK_CODING, 并且lineno为1, 所以将进入到具体的编码检测中。
static int
tok_underflow_file(struct tok_state *tok) {
... /* (excerpt; '...' marks code elided by the article) */
tok->lineno++;
if (tok->decoding_state != STATE_NORMAL) {
/* PEP 263: a coding declaration may only appear on line 1 or 2. */
if (tok->lineno > 2) {
tok->decoding_state = STATE_NORMAL;
}
else if (!check_coding_spec(tok->cur, strlen(tok->cur),
tok, fp_setreadl))
{
return 0;
}
}
...
}
/* Check whether the line contains a coding spec. If it does,
invoke the set_readline function for the new encoding.
This function receives the tok_state and the new encoding.
Return 1 on success, 0 on failure. */
static int
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
int set_readline(struct tok_state *, const char *))
{
char *cs;
if (tok->cont_line) {
/* It's a continuation line, so it can't be a coding spec. */
tok->decoding_state = STATE_NORMAL;
return 1;
}
if (!get_coding_spec(line, &cs, size, tok)) {
return 0;
}
if (!cs) {
/* No coding spec found on this line.  If the line contains anything
other than whitespace or a comment, stop looking on later lines. */
Py_ssize_t i;
for (i = 0; i < size; i++) {
if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
break;
if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') { /* '\014' == form feed */
/* Stop checking coding spec after a line containing
* anything except a comment. */
tok->decoding_state = STATE_NORMAL;
break;
}
}
return 1;
}
tok->decoding_state = STATE_NORMAL;
if (tok->encoding == NULL) {
assert(tok->decoding_readline == NULL);
/* For any encoding other than utf-8, install a decoding readline. */
if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) {
error_ret(tok);
PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
PyMem_Free(cs);
return 0;
}
tok->encoding = cs; /* ownership of cs transfers to tok */
} else { /* then, compare cs with BOM */
/* A BOM already fixed the encoding (utf-8, see check_bom); the
declared coding must agree with it. */
if (strcmp(tok->encoding, cs) != 0) {
error_ret(tok);
PyErr_Format(PyExc_SyntaxError,
"encoding problem: %s with BOM", cs);
PyMem_Free(cs);
return 0;
}
PyMem_Free(cs);
}
return 1;
}
其中函数get_coding_spec
从单行的注释中搜索coding:xxx 或者coding=xxx
decoding_state进入STATE_NORMAL状态
如果找到编码方式xxx, 并且不等于utf-8,则调用回调函数set_readline设置编码
2.1.5 判断默认编码方式下读取的数据是否是合法的utf8
因为hello.py中第一行或者第二行中并未指定编码方式,所以默认是utf8编码,并且不是BOM编码的文件,所以tok->encoding=NULL。
static int
tok_underflow_file(struct tok_state *tok) {
... /* (excerpt; '...' marks code elided by the article) */
/* The default encoding is UTF-8, so make sure we don't have any
non-UTF-8 sequences in it. */
if (!tok->encoding && !ensure_utf8(tok->cur, tok)) {
error_ret(tok);
return 0;
}
...
}
/* Verify that line consists entirely of valid UTF-8 sequences.
   On failure, raise SyntaxError (cf. PEP 263) and return 0. */
static int
ensure_utf8(char *line, struct tok_state *tok)
{
int badchar = 0;
unsigned char *c;
int length;
for (c = (unsigned char *)line; *c; c += length) { /* advance one UTF-8 sequence at a time */
if (!(length = valid_utf8(c))) {
badchar = *c; /* remember the offending lead byte for the error message */
break;
}
}
if (badchar) {
/* Need to add 1 to the line number, since this line
has not been counted, yet. */
PyErr_Format(PyExc_SyntaxError,
"Non-UTF-8 code starting with '\\x%.2x' "
"in file %U on line %i, "
"but no encoding declared; "
"see https://python.org/dev/peps/pep-0263/ for details",
badchar, tok->filename, tok->lineno + 1);
return 0;
}
return 1;
}
/* Check whether the characters at s start a valid
UTF-8 sequence. Return the number of characters forming
the sequence if yes, 0 if not. */
/* Check whether the bytes at s begin a well-formed UTF-8 sequence.
   Return the sequence length (1..4) if so, 0 otherwise. */
static int
valid_utf8(const unsigned char *s)
{
    unsigned char lead = s[0];
    int nbytes;

    if (lead < 0x80) {
        /* Plain ASCII: a sequence of one. */
        return 1;
    }
    if (lead < 0xC0) {
        /* 10xxxxxx is a continuation byte; it cannot start a sequence. */
        return 0;
    }
    if (lead < 0xE0) {
        nbytes = 2;           /* 110xxxxx */
    }
    else if (lead < 0xF0) {
        nbytes = 3;           /* 1110xxxx */
    }
    else if (lead < 0xF8) {
        nbytes = 4;           /* 11110xxx */
    }
    else {
        /* 0xF8..0xFF never appear in UTF-8. */
        return 0;
    }
    /* Every remaining byte of the sequence must match 10xxxxxx. */
    for (int i = 1; i < nbytes; i++) {
        if ((s[i] & 0xC0) != 0x80) {
            return 0;
        }
    }
    return nbytes;
}
当一行读取到缓冲区buf中后,结构图如下
2.2 获取一个字符
现在tok->cur != tok->inp成立,直接取出当前字符。
static int
tok_nextc(struct tok_state *tok)
{
int rc;
for (;;) {
if (tok->cur != tok->inp) { /* buffer holds unread data: serve the next byte */
return Py_CHARMASK(*tok->cur++); /* Fast path */
}
... /* (excerpt; '...' marks code elided by the article) */
}
...
}
/* Argument must be a char or an int in [-128, 127] or [0, 255]. */
/* Masks to the low 8 bits and casts, yielding the byte value 0..255
   even on platforms where plain char is signed. */
#define Py_CHARMASK(c) ((unsigned char)((c) & 0xff))
比如现在取出字符’d’,‘e’,'f’后的结构