Wx::

编译原理 · 词法分析器

将源代码编译成可执行机器指令的第一步,会被后续的语法(如算符优先文法、LL 文法、LR 文法)分析器调用。

功能

  • 识别出连续「字符串」中单独的「符号」,将字符串转换为符号串
  • 并标记每个符号的性质(以 种别码 标识),如关键字、常字符串、常数等

基本原理

即从前至后不断读取字符,直至可判断出当前一小段串(token)的类型时,将其归为一个符号(symbol)。
专业点说称其为自动机,一般为确定有限自动机 DFA 实现。

符号(Symbol)

符号,即为一个具有某些性质的单词,是词法分析器及之后语法分析的基本元素。将其简单封装为一个「类」,有以下定义:

具体定义:-brev-

/**
 * A lexical symbol (token): its text plus an integer type code.
 *
 * Equality and inequality compare only `data`; `type` is ignored.
 * All comparison operators are const so they can be used on const
 * objects (for example, elements read back from a std::set).
 */
class Symbol {
public:
    string data; // text of the symbol
    int type;    // type code; -1 when none was supplied

    /** Creates an empty symbol with type code -1 (previously left uninitialized). */
    Symbol() : data(), type(-1) {}

    /** Creates a symbol from a string and an optional type code. */
    Symbol(string s, int t = -1) : data(s), type(t) {}

    /** Creates a one-character symbol with an optional type code. */
    Symbol(char s, int t = -1) : data(1, s), type(t) {}

    /** True when the symbol text equals `a`. */
    bool operator == (const string &a) const {
        return data == a;
    }
    /** True when both symbols have the same text (type codes are not compared). */
    bool operator == (const Symbol &s) const {
        return data == s.data;
    }
    /** True when the symbol text differs from `a`. */
    bool operator != (const string &a) const {
        return data != a;
    }
    /** True when the two symbols have different text (type codes are not compared). */
    bool operator != (const Symbol &s) const {
        return data != s.data;
    }
};

// Strict ordering on the symbol text only, so that Symbol can be stored in
// ordered containers such as std::set. The type code takes no part in it.
bool operator < (const Symbol &s1, const Symbol &s2) {
    const int order = s1.data.compare(s2.data);
    return order < 0;
}

over

词法分析器

准备过程

  • 处理待分析源文本 prepareSrcCode() ,即处理空白符等,调用 handleCode(string code)

  • 将原始关键词数组(成员常量 key)加工为 关键词-种别码 的映射关系(成员变量 keywords)

  • 开始分析 process()

分析器定义

// Lexical analyzer: turns source text into a sequence of Symbol objects.
// The source can be given directly as a string or read from a file; when an
// output path is set, the prepared source and the results are written there.
//
// NOTE(review): the out-of-class definitions of the single-argument
// constructor and of process() supply default arguments (`= ""`) that are not
// repeated here. Adding a default argument that turns a constructor into a
// default constructor outside the class is rejected by some compilers —
// confirm against the toolchain in use.
class LexicalAnalyzer {
public:
    // Builds an analyzer over an in-memory source string.
    LexicalAnalyzer(string code);
    // Builds an analyzer that reads srcPath and writes its report to outPath.
    LexicalAnalyzer(string srcPath, string outPath);
    // Closes the output stream if it is still open.
    ~LexicalAnalyzer() {
        if (fout.is_open()) {
            fout.close();
        }
    }

    // Replaces the keyword set:
    void setKeywords(const vector<string> &v);
    // Adds keywords on top of the default ones:
    void addKeywords(const vector<string> &v);
    // Main analysis routine; returns the symbol sequence and a success flag:
    pair<vector<Symbol>, bool> process(string code); 

    static const int VarTypeNum = 25;      // type code for identifiers
    static const int IntTypeNum = 26;      // type code for integer constants
    static const int StringTypeNum = 100;  // type code for string constants
    static const int DoubleTypeNum = 101;  // type code for floating-point constants

private:
    ifstream fin;
    ofstream fout;
    string srcFilePath = ""; // path of the source file
    string outFilePath = ""; // path of the output file

    map<string, int>keywords; // keyword -> type code
    set<Symbol> symbolResults; // set of symbols found by the analysis
    string src; // prepared text that will be analyzed

    // Default keyword list: C keywords, some C++ keywords, and operator /
    // punctuation spellings. prepareKeywords derives type codes from the
    // position of each entry, so the order of this list matters.
    const vector<string> key = { "#","main","if","then","while","do","static","int","double","struct","break","else","long","switch","case","typedef","char","return","const","float","short","continue","for","void","sizeof","+","-","*","/",":",":=","<","<>","<=",">",">=","=","default","null",";","(",")","[","]",",","{","}","&&","||","!","!=","<<",">>",".","%","++","--","+=","-=",",","*=","/=","%=","&","|","~","^","!=","&=","|=","^=","<<=",">>=","signed","unsigned","enum","extern","union","==", "?" };

    // Builds the `keywords` map from a keyword list (or from `key`):
    void prepareKeywords(const vector<string> &v);
    // Reads and prepares the source file; returns whether it succeeded:
    bool prepareSrcCode();
    // Normalizes one piece of code (blanks, comments):
    string handleCode(string code);
};

构造函数

具体定义:-brev-

// Builds an analyzer over an in-memory source string.
// An empty argument leaves the source buffer empty; anything else is
// normalized through handleCode before being stored.
LexicalAnalyzer::LexicalAnalyzer(string code = "") {
    src = (code == "") ? string() : handleCode(code);
}

// Builds an analyzer bound to a source file and an output file, then loads
// and prepares the source text right away. The result of prepareSrcCode is
// not inspected here.
LexicalAnalyzer::LexicalAnalyzer(string srcPath, string outPath)
    : srcFilePath(srcPath), outFilePath(outPath) {
    prepareSrcCode();
}

over

准备过程

具体定义:-brev-

// Normalizes a piece of source text before analysis.
//
//  - leading blanks are dropped;
//  - every run of blank characters (as decided by isBlank) becomes a single
//    space. Previously only spaces were kept and other blanks such as tabs
//    were deleted outright, which glued neighbouring tokens together;
//  - a `//` comment is skipped up to the end of its line. For single-line
//    input this is the same as stopping there; for multi-line input the
//    following lines are no longer discarded.
//
// NOTE(review): blanks and `//` inside string literals are treated the same
// way as anywhere else, exactly as before.
//
// @param code raw source text
// @return the normalized text
string LexicalAnalyzer::handleCode(string code) {
    const size_t len = code.size();
    size_t i = 0;

    // Skip leading blanks without reading past the end.
    while (i < len && isBlank(code[i])) {
        ++i;
    }

    string s;
    s.reserve(len - i);
    while (i < len) {
        if (code[i] == '/' && i + 1 < len && code[i + 1] == '/') {
            // Skip the comment; the line break itself (if any) is left in
            // place so that it is handled like any other character.
            while (i < len && code[i] != '\n') {
                ++i;
            }
            continue;
        }
        if (isBlank(code[i])) {
            // Collapse the whole run of blanks into one separator.
            while (i < len && isBlank(code[i])) {
                ++i;
            }
            s += ' ';
            continue;
        }
        s += code[i];
        ++i;
    }
    return s;
}

bool LexicalAnalyzer::prepareSrcCode() {
    // 读取输入文件并处理其源文本
    // 返回是否成功

    if (srcFilePath.compare("") == 0) {
        cout << "error: src file path undefined!" << endl;
        return false;
    }
    if (!fin.is_open()) {
        fin.open(srcFilePath);
    }
    src = "";

    string t;
    while (getline(fin, t)) {
        src += handleCode(t);
    }
    if (outFilePath.compare("") != 0) {
        if (!fout.is_open()) {
            fout.open(outFilePath);
        }
        fout << "prepared source code:\n" << src << endl;
    }
    fin.close();
    return true;
}

void LexicalAnalyzer::setKeywords(const vector<string> &v) {
    prepareKeywords(v);
}

void LexicalAnalyzer::addKeywords(const vector<string> &v) {
    // 在原有关键词基础上添加

    vector<string> res;
    res.insert(res.end(), key.begin(), key.end());
    res.insert(res.end(), v.begin(), v.end());
    prepareKeywords(res);
}

void LexicalAnalyzer::prepareKeywords(const vector<string> &v = {}) {
    // 将字符串数组 v 或成员常量 key 写入关键字-种别码的 map表:

    // 变量 ID 种别码为 25
    // 字符串种别码为成员变量 StringNum 的值
    // 0~25 及 27~ 之后由 key 数组顺序而定

    vector<string> key;
    if (v.size()) {
        key = v;
    }
    else {
        key = this->key;
    }
    keywords.clear();
    int i = 0;
    for (; i <= 24; i++) {
        keywords[key[i]] = i;
    }
    for (int j = 27; i < key.size(); j++,i++) {
        keywords[key[i]] = j;
    }
}

over

主体分析过程

具体定义:-brev-

pair<vector<Symbol>, bool> LexicalAnalyzer::process(string code="") {
    // 为了使语法分析器调用,此处直接返回符号的串和出错标志的 pair 值

    if(!keywords.size()){
        // 处理准备:
        prepareKeywords();
    }

    if (code.compare("") != 0) {
        // 若参数 code 不为空,则此次解析 code 而非原有成员变量 src
        src = handleCode(code);
    }

    // 开始分析:
    bool error = false;
    string token = "";
    bool doubleFlag = false;
    stringstream ss;

    vector<Symbol> symbols; // 作为单独的符号串结果返回

    for (int i = 0; i < src.size();) {
        if ((src[i] >= 'a'&&src[i] <= 'z') || (src[i] >= 'A'&&src[i] <= 'Z')) {
            while (i<src.size() && (src[i] >= 'a' && src[i] <= 'z') || (src[i] >= 'A' && src[i] <= 'Z') || src[i] == '_' || (src[i] >= '0'&&src[i] <= '9') || src[i]=='\'') {
                // 为了支持文法中的类似于 E' 的符号,在 while 最后包含了这个判断条件
                token += src[i];
                i++;
            }
            if (keywords.count(token)) {
                //symbolResults[token] = keywords[token];
                symbolResults.insert(Symbol(token, keywords[token])); // 用括号的形式隐形调用 Symbol 构造函数
            }
            else {
                //symbolResults[token] = 25;
                symbolResults.insert(Symbol(token, VarTypeNum));
            }
            symbols.push_back(Symbol(token, VarTypeNum));
            token = "";
        }
        else if (src[i] >= '0'&&src[i] <= '9') {
            // 数值
            doubleFlag = false;
            while (i<src.size()) {
                if (src[i] >= '0' && src[i] <= '9') {
                    token += src[i];
                    i++;
                }
                else if (src[i] == '.') {
                    token += src[i];
                    doubleFlag = true;
                    i++;
                }
                else {
                    break;
                }
            }
            symbols.push_back(Symbol(token, DoubleTypeNum));
            //ss.clear();
            //ss << token;
            if (doubleFlag) {
                //ss >> dsum;
                symbolResults.insert(Symbol(token, DoubleTypeNum));
                //doubleResults.insert(make_pair(dsum, 26));
            }
            else {
                //ss >> isum;
                symbolResults.insert(Symbol(token, IntTypeNum));
                //intResults.insert(make_pair(isum, 26));
            }
            token = "";
        }
        else if (isBlank(src[i])) {
            i++;
        }
        else if (src[i] == '"' || src[i] == '\'') {
            // 常量字符串
            char quote = src[i];
            i++; // 从引号下一个字符开始读取常量字符串
            while (i<src.size() && src[i] != '\n' && src[i] != quote) {
                token += src[i];
                if (src[i] == '\\') {
                    // 处理转移符 '\'(这个转移符不加单引号会使下一行出 bug)
                    i++;
                    token += src[i];
                }
                i++;
            }
            if (src[i] == quote) {
                // 正确结束字符串
                symbolResults.insert(Symbol(token, StringTypeNum));
                //stringResults[token] = StringNum;
                token = "";
                i++;
            }
            else {
                // 未结束字符串
                error = true;
                cout << "error: 引号" << endl;
                cout << "\tpos: " << i << endl;
                cout << "\tchar: " << src[i] << endl;
                cout << "\ttoken: " << token << endl;
                break;
            }

        }
        else {
            // 非字母关键字(特殊符号)
            bool ok = false;
            while (i<src.size()) {
                token += src[i];
                if (keywords.count(token)) {
                    if (i == src.size() - 1 || keywords.count(token + src[i + 1]) == 0) {
                        i++;
                    }
                    else if (i == src.size() - 2 || keywords.count(token + src[i + 1] + src[i + 2]) == 0) {
                        token += src[i + 1];
                        i += 2;
                    }
                    else {
                        token += src[i + 1];
                        token += src[i + 2];
                        i += 3;
                    }
                    symbolResults.insert(Symbol(token, keywords[token]));
                    //symbolResults[token] = keywords[token];
                    symbols.push_back(Symbol(token, keywords[token]));
                    token = "";
                    ok = true;
                    break;
                }
                else {
                    break;
                }
            }
            if (i < src.size() && !ok) {
                error = true;
                cout << "error: 特殊符号" << endl;
                cout << "\tpos: " << i << endl;
                cout << "\tchar: " << src[i] << endl;
                cout << "\ttoken: " << token << endl;
                break;
            }
        }
    }

    if (outFilePath.compare("") != 0) {
        if (!fout.is_open()) {
            fout.open(outFilePath);
        }

        fout << "\nkeyword results:" << endl;
        for (auto it = symbolResults.begin(); it != symbolResults.end(); it++) {
            if (it->type != IntTypeNum && it->type!=StringTypeNum && it->type!=DoubleTypeNum && it->type != VarTypeNum) {
                fout << setiosflags(ios::left) << setw(25) << it->data << it->type << endl;
            }
        }
        fout << "\nvar results:" << endl;
        for (auto it = symbolResults.begin(); it != symbolResults.end(); it++) {
            if (it->type == VarTypeNum) {
                fout << setiosflags(ios::left) << setw(25) << it->data << it->type << endl;
            }
        }
        fout << "\nstring results:" << endl;
        for (auto it = symbolResults.begin(); it != symbolResults.end(); it++) {
            if(it->type == StringTypeNum)
                fout << ("\"" + it->data + "\"") << endl;
        }
        fout << "\nnumber results:" << endl;
        for (auto it = symbolResults.begin(); it != symbolResults.end(); it++) {
            if(it->type == IntTypeNum)
                fout << setiosflags(ios::left) << setw(25) << it->data << it->type << endl;
        }
        for (auto it = symbolResults.begin(); it != symbolResults.end(); it++) {
            if(it->type == DoubleTypeNum)
                fout << setiosflags(ios::left) << setw(25) << it->data << it->type << endl;
        }

        fout.close();
    }
    return make_pair(symbols,!error);
}

over


315

评论(0

评论 取消
验证码:
搜索