Building an AST from a Regular Expression
This post was originally featured as a section of my post on implementing Thompsons Construction for NFA from a regular expression. Having since implemented several FA constructions, all of which utilized an AST representation of the given expression, I decided to give the topic It's own post. When it comes to the actual construction of the AST, it can be done either implicitly or explicitly, bottom up or top down. In this post I'm going to cover the construction of an explicit AST from a regular expression, bottom up via operator precedence. An alternative method is to construct the AST using recursive descent, which proceeds top-down.
Implementing the parser
class Parser {
private:
bool isOp(char c);
int precedence(char c);
int leftAssociative(char c);
string addConcatOp(string& str);
string makePostfix(string& str);
RegularExpression* makeTree(string& postfix);
public:
Parser();
RegularExpression* parse();
};
bool leftAssociative(char c) {
switch (c) {
case '*':
case '+':
case '?':
case '|':
case '@':
return true;
default:
break;
}
return false;
}
When it comes to precedence, Closure's have the highest precedence, followed by concatenation and then alternation.
int precedence(char c) {
switch (c) {
case '*': return 50;
case '+': return 50;
case '?': return 50;
case '@': return 30;
case '|': return 20;
default:
break;
}
return 10;
}
Making Implicit Operators Explicit
string addConcatOp(string str) {
string fixed;
for (int i = 0; i < str.length(); i++) {
fixed.push_back(str[i]);
if (str[i] == '(' || str[i] == '|')
continue;
if (i+1 < str.length()) {
char p = str[i+1];
if (p == '|' || p == '*' || p == '+' || p == ')')
continue;
fixed.push_back('@');
}
}
return fixed;
}
Converting The Expression To Postfix
string makePostfix(string& str) {
str = addConcatOp(str);
cout<<"Inserting explicit concat operators: "<<str<<endl;
string postfix;
for (int i = 0; i < str.length(); i++) {
if (str[i] == '(') {
ops.push(str[i]);
} else if (str[i] == '|' || str[i] == '*' || str[i] == '+' || str[i] == '@' || str[i] == '?') {
if (precedence(str[i]) < precedence(ops.top()) || (precedence(str[i]) == precedence(ops.top()) && leftAssociative(str[i]))) {
char c = ops.pop();
postfix.push_back(c);
ops.push(str[i]);
} else {
ops.push(str[i]);
}
} else if (str[i] == ')') {
while (!ops.empty()) {
char c = ops.pop();
if (c == '(')
break;
else postfix.push_back(c);
}
} else {
postfix.push_back(str[i]);
}
}
while (!ops.empty()) {
char c = ops.pop();
if (c != '(')
postfix.push_back(c);
}
return postfix;
}
Building The AST
class RegularExpression {
public:
virtual char getSymbol() = 0;
virtual RegularExpression* getLeft() = 0;
virtual RegularExpression* getRight() = 0;
};
class ExpressionLiteral : public RegularExpression {
private:
char symbol;
public:
ExpressionLiteral(char sym) { symbol = sym; }
RegularExpression* getLeft() {
return nullptr;
}
RegularExpression* getRight() {
return nullptr;
}
char getSymbol() {
return symbol;
}
};
class ExpressionOperator : public RegularExpression {
private:
RegularExpression* left;
RegularExpression* right;
char symbol;
public:
ExpressionOperator(char c, RegularExpression* ll, RegularExpression* rr) {
symbol = c;
left = ll;
right = rr;
}
char getSymbol() {
return symbol;
}
RegularExpression* getLeft() {
return left;
}
RegularExpression* getRight() {
return right;
}
};
RegularExpression* makeTree(string postfix) {
Stack<RegularExpression*> sf;
for (char c : postfix) {
cout<<"Processing: "<<c<<" ";
if (!isOp(c)) {
cout<<"Alphabet."<<endl;
sf.push(new ExpressionLiteral(c));
} else {
cout<<"Operator."<<endl;
switch (c) {
case '@':
case '|': {
auto right = sf.empty() ? nullptr:sf.pop();
auto left = sf.empty() ? nullptr:sf.pop();
sf.push(new ExpressionOperator(c, left, right));
} break;
case '+':
case '?':
case '*':
sf.push(new ExpressionOperator(c, sf.empty() ? nullptr:pop(), nullptr);
break;
default:
break;
}
}
return sf.pop();
}
re_ast* makeTree(REToken* tokens) {
int len = tokensLength(tokens);
re_ast* st[len];
int sp = 0;
for (REToken* it = tokens; it != NULL; it = it->next) {
if (isOp(*it)) {
re_ast* n = makeNode(1, *it);
switch (it->symbol) {
case RE_OR:
case RE_CONCAT:
if (sp < 2) {
fprintf(stderr, "ERROR: stack underflow for binary operator %c\n", it->symbol);
exit(EXIT_FAILURE);
}
n->right = (sp > 0) ? st[--sp]:NULL;
n->left = (sp > 0) ? st[--sp]:NULL;
break;
case RE_STAR:
n->left = (sp > 0) ? st[--sp]:NULL;
break;
case RE_QUESTION: {
free(n);
n = makeNode(1, *makeToken(RE_OR, '|'));
n->left = (sp > 0) ? st[--sp]:NULL;
n->right = makeNode(2, *makeToken(RE_EPSILON, '&'));
} break;
case RE_PLUS: {
free(n);
n = makeNode(1, *makeToken(RE_CONCAT, '@'));
re_ast* operand = (sp > 0) ? st[--sp]:NULL;
n->left = operand;
n->right = makeNode(1, *makeToken(RE_STAR, '*'));
n->right->left = cloneTree(operand);
} break;
default: break;
}
st[sp++] = n;
} else {
re_ast* t = makeNode(2, *it);
st[sp++] = t;
}
}
return st[--sp];
}
I have used both methods in practice: the second method when implementing the straight to DFA construction, and the first when implementing thompsons construction of an NFA.
-
Building an AST from a Regular Expression
-
The Aho, Sethi, Ullman Direct DFA Construction Part 2: Building the DFA from Followpos
-
The Aho, Sethi, Ullman Direct DFA Construction, Part 1: Constructing the Followpos Table
-
Procedural Map Generation with Binary Space Partitioning
-
Exact Match String Searching: The Knuth-Morris-Pratt Algorithm
-
The visitor pattern: OOP takes on the Expression Problem
-
Improving mgcLisp's define syntax
-
Separating the Men from the Boys, Knuth Style
-
Reducing rotations during deletion from AVL trees
-
Parsing Lisp: From Data To Code
Leave A Comment