词法分析

一个简单的词法分析程序如下：

开始；
调用识别器；
判断是否为关键字或标识符，如果是，跳转到步骤 4；如果否，跳转到步骤 5；
查关键字表(KT表，keyword table)，如果是关键字则记录该标记该值为K.TOKEN；否则查填标识符表(IT表，identifier table)，识别该值为I.TOKEN；
判断是否为算术常数，如果是，按常数处理，查填常数表(CT表，const table)，识别该值为C.TOKEN；否则，跳转到步骤 6；
判断是否为结束符，如果是，则结束词法分析程序；否则，查界符表(PT 表)，识别该值为P.TOKEN。

以上提到了几种 token：K.TOKEN、I.TOKEN、C.TOKEN、P.TOKEN，以下面一段 C 程序为例说明：

int foo() {
    return 1;
}

以上程序的词法分析结果如下表：

value	int	foo	(	)	{	return	1	;	}
type	K.TOKEN	I.TOKEN	P.TOKEN	P.TOKEN	P.TOKEN	K.TOKEN	C.TOKEN	P.TOKEN	P.TOKEN

其中关键字表如下：

int

return

界符表如下：

(

)

{

;

}

简单的词法分析程序

以下程序用于分析上述 C 语言程序的词法。由于是示例程序，就没有进行太多的优化及词法的完善。

package main

import "fmt"

type TokenType string

type Token struct {
    Literal string
    Type    TokenType
}

const (
    EOF     = "EOF"
    ILLEGAL = "ILLEGAL"

    // 标识符
    IDENTIFIER = "IDENTIFIER"

    // 整数
    INTEGER = "INTEGER"

    // 界符
    LPAREN    = "("
    RPAREN    = ")"
    LBRACE    = "{"
    RBRACE    = "}"
    SEMICOLON = ";"

    // 关键字
    INT    = "INT"
    RETURN = "RETURN"
)

var keywords = map[string]TokenType{
    "int":    INT,
    "return": RETURN,
}

// 检查标识符是否关键字
func LookupIdent(ident string) TokenType {
    if tk, ok := keywords[ident]; ok {
                                         return tk
                                         }
    return IDENTIFIER
}

// 创建 Token
func newToken(tokenType TokenType, ch byte) Token {
    return Token{Type: tokenType, Literal: string(ch)}
}

type Lexer struct {
    input        string
    position     int
    readPosition int
    ch           byte
}

func New(input string) *Lexer {
    l := &Lexer{input: input}
    l.readChar()
    return l
}

func (l *Lexer) readChar() {
    if l.readPosition >= len(l.input) {
        l.ch = 0 // NUL，表示到了源码的末尾
    } else {
        l.ch = l.input[l.readPosition]
    }

    l.position = l.readPosition
    l.readPosition += 1
}

func (l *Lexer) skipWhitespace() {
    for l.ch == ' ' || l.ch == '\t' || l.ch == '\n' || l.ch == '\r' {
        l.readChar()
    }
}

func isLetter(ch byte) bool {
    return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_'
}

// 读取数值，暂时只能读取整数
func (l *Lexer) readNumber() string {
    position := l.position
    for isDigit(l.ch) {
        l.readChar()
    }
    return l.input[position:l.position]
}

func isDigit(ch byte) bool {
    return '0' <= ch && ch <= '9'
}

func (l *Lexer) readIdentifier() string {
    position := l.position
    for isLetter(l.ch) {
        l.readChar()
    }
    return l.input[position:l.position]
}

func (l *Lexer) NextToken() Token {
    var tk Token

    l.skipWhitespace()

    switch l.ch {
    case 0:
        tk.Literal = ""
        tk.Type = EOF
    case '(':
        tk = newToken(LPAREN, l.ch)
    case ')':
        tk = newToken(RPAREN, l.ch)
    case '{':
        tk = newToken(LBRACE, l.ch)
    case '}':
        tk = newToken(RBRACE, l.ch)
    case ';':
        tk = newToken(SEMICOLON, l.ch)
    default:
        if isLetter(l.ch) {
            tk.Literal = l.readIdentifier()
            tk.Type = LookupIdent(tk.Literal)
            return tk
        } else if isDigit(l.ch) {
            tk.Type = INTEGER
            tk.Literal = l.readNumber()
            return tk
        } else {
            tk = newToken(ILLEGAL, l.ch)
        }
    }

    l.readChar()
    return tk
}

func main() {
    input := `
int foo() {
    return 1;
}
`
    l := New(input)

    for {
        tk := l.NextToken()
        fmt.Println(tk)
        if tk.Type == EOF || tk.Type == ILLEGAL {
            break
        }
    }
}

以上词法分析程序将 C 语言的代码作为输入，开始调用识别器；然后一个接着一个字节地读取；遇到特定字节时，识别为不同的 token。为了代码的清晰，将开头的步骤调换了一下顺序，但并不影响最终的结果。

William A

简单的词法分析程序

Search

Table of Contents