从一个小型编译器一窥编译原理的本质

2025/02/27 编译原理

编译器可以粗略地分为几个简单的部分:词法分析器、语法分析器、代码生成器。其中语义分析器、代码优化器这些不影响本文示例功能。另外,本文的编译器生成的是字节码,因此还需要一个虚拟机来执行。

Lexer 词法分析器

Lexer的功能是将代码串不同的符号进行标记分组,例如(3 + 5) * 2可以拆分为不同的token,分别是(3+5)*2,包含五个分类,有左括号、右括号、加号、乘号以及整数,其中的空白符号需要忽略。

class Lexer
{
    private $input;
    private $pos = 0;
    private $currentChar;

    public function __construct($input)
    {
        $this->input = $input;
        $this->currentChar = $input[0] ?? null;
    }

    private function advance()
    {
        $this->pos++;
        $this->currentChar = $this->input[$this->pos] ?? null;
    }

    private function skipWhitespace()
    {
        while ($this->currentChar !== null && ctype_space($this->currentChar)) {
            $this->advance();
        }
    }

    private function integer()
    {
        $result = '';
        while ($this->currentChar !== null && ctype_digit($this->currentChar)) {
            $result .= $this->currentChar;
            $this->advance();
        }
        return (int)$result;
    }

    private function identifier()
    {
        $result = '';
        while ($this->currentChar !== null && (ctype_alnum($this->currentChar) || $this->currentChar === '_')) {
            $result .= $this->currentChar;
            $this->advance();
        }
        return $result;
    }

    public function getNextToken()
    {
        while ($this->currentChar !== null) {
            // 忽略空白符
            if (ctype_space($this->currentChar)) {
                $this->skipWhitespace();
                continue;
            }

            // 当前字符为整型时,扫描后续部分的整型字符
            if (ctype_digit($this->currentChar)) {
                return ['type' => 'INTEGER', 'value' => $this->integer()];
            }

            // 当前字符为字母时,扫描后续部分的字母或数字字符
            if (ctype_alpha($this->currentChar)) {
                $id = $this->identifier();
                // 如果扫描到的 token 为 print,则将其标记为关键词 print
                if (strtoupper($id) === 'PRINT') {
                    return ['type' => 'PRINT', 'value' => $id];
                }
                return ['type' => 'ID', 'value' => $id];
            }

            // 普通的符号
            switch ($this->currentChar) {
                case '=':
                    $this->advance();
                    return ['type' => 'ASSIGN', 'value' => '='];
                case '+':
                    $this->advance();
                    return ['type' => 'PLUS', 'value' => '+'];
                case '-':
                    $this->advance();
                    return ['type' => 'MINUS', 'value' => '-'];
                case '*':
                    $this->advance();
                    return ['type' => 'MUL', 'value' => '*'];
                case '/':
                    $this->advance();
                    return ['type' => 'DIV', 'value' => '/'];
                case ';':
                    $this->advance();
                    return ['type' => 'SEMI', 'value' => ';'];
                case '(':
                    $this->advance();
                    return ['type' => 'LPAREN', 'value' => '('];
                case ')':
                    $this->advance();
                    return ['type' => 'RPAREN', 'value' => ')'];
                default:
                    throw new Exception("Invalid character: " . $this->currentChar);
            }
        }

        return ['type' => 'EOF', 'value' => null];
    }
}

Parser 语法分析器

Parser遍历token,生成AST

class Parser
{
    private $lexer;
    private $currentToken;

    public function __construct($lexer)
    {
        $this->lexer = $lexer;
        $this->currentToken = $lexer->getNextToken();
    }

    // 用于消费一个指定类型的 token
    private function eat($tokenType)
    {
        if ($this->currentToken['type'] === $tokenType) {
            $this->currentToken = $this->lexer->getNextToken();
        } else {
            throw new Exception("Syntax error: expected $tokenType");
        }
    }

    // 第一优先级的 token,分别是整型、标识符(这里是变量)、左括号(左括号会递归解释括号中的表达式)
    private function factor()
    {
        $token = $this->currentToken;

        if ($token['type'] === 'INTEGER') {
            $this->eat('INTEGER');
            return ['type' => 'NumberLiteral', 'value' => $token['value']];
        }

        if ($token['type'] === 'ID') {
            $this->eat('ID');
            return ['type' => 'Variable', 'name' => $token['value']];
        }

        if ($token['type'] === 'LPAREN') {
            $this->eat('LPAREN');
            $node = $this->expr();
            $this->eat('RPAREN');
            return $node;
        }

        throw new Exception("Unexpected factor: " . $token['type']);
    }

    // 第二优先级。乘除法,解释为一个二元表达式
    private function term()
    {
        $node = $this->factor();

        while (in_array($this->currentToken['type'], ['MUL', 'DIV'])) {
            $token = $this->currentToken;
            $this->eat($token['type']);
            $node = [
                'type' => 'BinaryOp',
                'left' => $node,
                'op' => $token['value'],
                'right' => $this->factor()
            ];
        }

        return $node;
    }

    // 第三优先级。加减法,解释为一个二元表达式
    private function expr()
    {
        $node = $this->term();

        while (in_array($this->currentToken['type'], ['PLUS', 'MINUS'])) {
            $token = $this->currentToken;
            $this->eat($token['type']);
            $node = [
                'type' => 'BinaryOp',
                'left' => $node,
                'op' => $token['value'],
                'right' => $this->term()
            ];
        }

        return $node;
    }

    // 目前只有两种语句啦:print 语句;赋值表达式
    private function statement()
    {
        if ($this->currentToken['type'] === 'PRINT') {
            $this->eat('PRINT');
            $expr = $this->expr();
            $this->eat('SEMI');
            return ['type' => 'PrintStatement', 'expr' => $expr];
        }

        if ($this->currentToken['type'] === 'ID') {
            $varName = $this->currentToken['value'];
            $this->eat('ID');
            $this->eat('ASSIGN');
            $expr = $this->expr();
            $this->eat('SEMI');
            return ['type' => 'Assignment', 'name' => $varName, 'expr' => $expr];
        }

        throw new Exception("Invalid statement");
    }

    public function parse()
    {
        $statements = [];
        while ($this->currentToken['type'] !== 'EOF') {
            $statements[] = $this->statement();
        }
        return ['type' => 'Program', 'body' => $statements];
    }
}

Code Generator 代码生成器

递归遍历AST,生成字节码。

class CodeGenerator
{
    private $ast;
    private $instructions = [];
    private $varIndex = 0;
    private $varMap = [];

    public function __construct($ast)
    {
        $this->ast = $ast;
    }

    private function generateExpr($node)
    {
        switch ($node['type']) {
            case 'NumberLiteral':
                $this->instructions[] = ['op' => 'PUSH', 'value' => $node['value']];
                break;
            case 'Variable':
                if (!isset($this->varMap[$node['name']])) {
                    $this->varMap[$node['name']] = $this->varIndex++;
                }
                $this->instructions[] = ['op' => 'LOAD', 'address' => $this->varMap[$node['name']]];
                break;
            case 'BinaryOp':
                $this->generateExpr($node['left']);
                $this->generateExpr($node['right']);
                switch ($node['op']) {
                    case '+':
                        $this->instructions[] = ['op' => 'ADD'];
                        break;
                    case '-':
                        $this->instructions[] = ['op' => 'SUB'];
                        break;
                    case '*':
                        $this->instructions[] = ['op' => 'MUL'];
                        break;
                    case '/':
                        $this->instructions[] = ['op' => 'DIV'];
                        break;
                }
                break;
        }
    }

    public function generate()
    {
        foreach ($this->ast['body'] as $stmt) {
            switch ($stmt['type']) {
                case 'PrintStatement':
                    $this->generateExpr($stmt['expr']);
                    $this->instructions[] = ['op' => 'PRINT'];
                    break;
                case 'Assignment':
                    $this->generateExpr($stmt['expr']);
                    if (!isset($this->varMap[$stmt['name']])) {
                        $this->varMap[$stmt['name']] = $this->varIndex++;
                    }
                    $this->instructions[] = ['op' => 'STORE', 'address' => $this->varMap[$stmt['name']]];
                    break;
            }
        }
        return $this->instructions;
    }
}

VM 虚拟机

VM执行Code Generator一步生成的字节码。

class VM
{
    private $code;
    private $stack = [];
    private $memory = [];
    private $pc = 0;

    public function __construct($code)
    {
        $this->code = $code;
    }

    public function run()
    {
        while ($this->pc < count($this->code)) {
            $instr = $this->code[$this->pc++];
            switch ($instr['op']) {
                case 'PUSH':
                    $this->stack[] = $instr['value'];
                    break;
                case 'LOAD':
                    $this->stack[] = $this->memory[$instr['address']] ?? 0;
                    break;
                case 'STORE':
                    $value = array_pop($this->stack);
                    $this->memory[$instr['address']] = $value;
                    break;
                case 'ADD':
                    $b = array_pop($this->stack);
                    $a = array_pop($this->stack);
                    $this->stack[] = $a + $b;
                    break;
                case 'SUB':
                    $b = array_pop($this->stack);
                    $a = array_pop($this->stack);
                    $this->stack[] = $a - $b;
                    break;
                case 'MUL':
                    $b = array_pop($this->stack);
                    $a = array_pop($this->stack);
                    $this->stack[] = $a * $b;
                    break;
                case 'DIV':
                    $b = array_pop($this->stack);
                    $a = array_pop($this->stack);
                    $this->stack[] = $a / $b;
                    break;
                case 'PRINT':
                    echo array_pop($this->stack) . PHP_EOL;
                    break;
            }
        }
    }
}

测试代码

$input = <<<CODE
print (3 + 5) * 2;
a = 10 - 2;
print a * 3;
b = a * 5;
print b;
CODE;

try {
    // 词法分析
    $lexer = new Lexer($input);

    // 语法分析生成AST
    $parser = new Parser($lexer);
    $ast = $parser->parse();

    // 生成中间代码
    $generator = new CodeGenerator($ast);
    $code = $generator->generate();

    // 虚拟机执行
    $vm = new VM($code);
    $vm->run();
} catch (Exception $e) {
    echo "Error: " . $e->getMessage();
}

Search

    Table of Contents