parsertl
is a modern, modular parser generator.
Currently it supports LALR(1), but eventually it may support IELR and LALR(k)
in the future.
parsertl is available in both C++03 and C++14 flavours.
The syntax is based on yacc/bison. In addition EBNF syntax is also suported.
Sequence |
Meaning |
---|---|
[ ... ] |
Optional. |
? |
Optional. |
{ ... } |
Zero or more. |
* |
Zero or more. |
{ ... }- |
One or more. |
+ |
One or more. |
( ... ) |
Grouping. |
#include <parsertl/generator.hpp> #include <parsertl/state_machine.hpp> int main(int argc, char *argv[]) { parsertl::rules rules_; parsertl::state_machine sm_; try { // Calculator from flex & bison book. rules_.token("INTEGER"); rules_.left("'+' '-'"); rules_.left("'*' '/'"); rules_.precedence("UMINUS"); rules_.push("start", "exp"); rules_.push("exp", "exp '+' exp"); rules_.push("exp", "exp '-' exp"); rules_.push("exp", "exp '*' exp"); rules_.push("exp", "exp '/' exp"); rules_.push("exp", "'(' exp ')'"); rules_.push("exp", "'-' exp %prec UMINUS"); rules_.push("exp", "INTEGER"); parsertl::generator::build(rules_, sm_); } catch (const std::exception &e) { std::cout << e.what() << '\n'; } return 0; }
#include <parsertl/generator.hpp> #include <parsertl/match_results.hpp> #include <parsertl/parse.hpp> #include <parsertl/state_machine.hpp> int main(int argc, char *argv[]) { parsertl::rules grules_; parsertl::state_machine gsm_; lexertl::rules lrules_; lexertl::state_machine lsm_; try { // Calculator from flex & bison book. grules_.token("INTEGER"); grules_.left("'+' '-'"); grules_.left("'*' '/'"); grules_.precedence("UMINUS"); grules_.push("start", "exp"); grules_.push("exp", "exp '+' exp"); grules_.push("exp", "exp '-' exp"); grules_.push("exp", "exp '*' exp"); grules_.push("exp", "exp '/' exp"); grules_.push("exp", "'(' exp ')'"); grules_.push("exp", "'-' exp %prec UMINUS"); grules_.push("exp", "INTEGER"); parsertl::generator::build(grules_, gsm_); lrules_.push("[+]", grules_.token_id("'+'")); lrules_.push("-", grules_.token_id("'-'")); lrules_.push("[*]", grules_.token_id("'*'")); lrules_.push("[/]", grules_.token_id("'/'")); lrules_.push("\\d+", grules_.token_id("INTEGER")); lrules_.push("[(]", grules_.token_id("'('")); lrules_.push("[)]", grules_.token_id("')'")); lrules_.push("\\s+", lsm_.skip()); lexertl::generator::build(lrules_, lsm_); std::string expr_("1 + 2 * -3"); lexertl::citerator iter_(expr_.c_str(), expr_.c_str() + expr_.size(), lsm_); parsertl::match_results results_(iter_->id, gsm_); bool success_ = parsertl::parse(iter_, gsm_, results_); } catch (const std::exception &e) { std::cout << e.what() << '\n'; } return 0; }
#include <parsertl/generator.hpp> #include <iostream> #include <lexertl/iterator.hpp> #include <parsertl/iterator.hpp> int main() { parsertl::rules grules_; parsertl::state_machine gsm_; lexertl::rules lrules_; lexertl::state_machine lsm_; try { grules_.token("INTEGER"); grules_.left("'+' '-'"); grules_.left("'*' '/'"); grules_.precedence("UMINUS"); grules_.push("start", "exp"); const std::size_t add_index_ = grules_.push("exp", "exp '+' exp"); const std::size_t sub_index_ = grules_.push("exp", "exp '-' exp"); const std::size_t mul_index_ = grules_.push("exp", "exp '*' exp"); const std::size_t div_index_ = grules_.push("exp", "exp '/' exp"); grules_.push("exp", "'(' exp ')'"); const std::size_t umin_index_ = grules_.push("exp", "'-' exp %prec UMINUS"); const std::size_t int_index_ = grules_.push("exp", "INTEGER"); parsertl::generator::build(grules_, gsm_); lrules_.push("[+]", grules_.token_id("'+'")); lrules_.push("-", grules_.token_id("'-'")); lrules_.push("[*]", grules_.token_id("'*'")); lrules_.push("[/]", grules_.token_id("'/'")); lrules_.push("\\d+", grules_.token_id("INTEGER")); lrules_.push("[(]", grules_.token_id("'('")); lrules_.push("[)]", grules_.token_id("')'")); lrules_.push("\\s+", lrules_.skip()); lexertl::generator::build(lrules_, lsm_); std::string expr_("1 + 2 * -3"); lexertl::citerator liter_(expr_.c_str(), expr_.c_str() + expr_.size(), lsm_); parsertl::citerator iter_(liter_, gsm_); std::stack<int> stack_; for (; iter_->entry.action != parsertl::action::error && iter_->entry.action != parsertl::action::accept; ++iter_) { const std::size_t rule_ = iter_->reduce_id(); if (rule_ == add_index_) { const int rhs_ = stack_.top(); stack_.pop(); stack_.top() = stack_.top() + rhs_; } else if (rule_ == sub_index_) { const int rhs_ = stack_.top(); stack_.pop(); stack_.top() = stack_.top() - rhs_; } else if (rule_ == mul_index_) { const int rhs_ = stack_.top(); stack_.pop(); stack_.top() = stack_.top() * rhs_; } else if (rule_ == div_index_) { const int rhs_ = stack_.top(); stack_.pop(); stack_.top() = stack_.top() / rhs_; } else if (rule_ == umin_index_) { stack_.top() *= -1; } else if (rule_ == int_index_) { stack_.push(atoi(iter_.dollar(0).first)); } } std::cout << "Result: " << stack_.top() << '\n'; } catch (const std::exception &e) { std::cout << e.what() << '\n'; } return 0; }
#include <parsertl/generator.hpp> #include <iostream> #include <parsertl/lookup.hpp> template<typename iterator> class shift_functor { public: shift_functor(const std::size_t integer_, std::stack<int> &values_) : _integer(integer_), _values(values_) { } void operator()(const iterator &iter_) { if (iter_->id == _integer) { _values.push(atoi(iter_->first)); } } private: const std::size_t _integer; std::stack<int> &_values; }; class reduce_functor { public: std::size_t _add; std::size_t _subtract; std::size_t _multiply; std::size_t _divide; std::size_t _uminus; reduce_functor(std::stack<int> &values_) : _add(~0), _subtract(~0), _multiply(~0), _divide(~0), _uminus(~0), _values(values_) { } void operator()(const std::size_t rule_) { if (rule_ == _add) { int rhs_ = 0; rhs_ = _values.top(); _values.pop(); _values.top() += rhs_; } else if (rule_ == _subtract) { int rhs_ = 0; rhs_ = _values.top(); _values.pop(); _values.top() -= rhs_; } else if (rule_ == _multiply) { int rhs_ = 0; rhs_ = _values.top(); _values.pop(); _values.top() *= rhs_; } else if (rule_ == _divide) { int rhs_ = 0; rhs_ = _values.top(); _values.pop(); _values.top() /= rhs_; } else if (rule_ == _uminus) { _values.top() *= -1; } } private: std::stack<int> &_values; }; int main(int argc, char *argv[]) { parsertl::rules grules_; parsertl::state_machine gsm_; parsertl::rules::string_vector symbols_; lexertl::rules lrules_; lexertl::state_machine lsm_; std::stack<int> values_; try { reduce_functor reduce_(values_); // Calculator from flex & bison book. grules_.token("INTEGER"); grules_.left("'+' '-'"); grules_.left("'*' '/'"); grules_.precedence("UMINUS"); grules_.push("start", "exp"); reduce_._add = grules_.push("exp", "exp '+' exp"); reduce_._subtract = grules_.push("exp", "exp '-' exp"); reduce_._multiply = grules_.push("exp", "exp '*' exp"); reduce_._divide = grules_.push("exp", "exp '/' exp"); grules_.push("exp", "'(' exp ')'"); reduce_._uminus = grules_.push("exp", "'-' exp %prec UMINUS"); grules_.push("exp", "INTEGER"); shift_functor<lexertl::citerator> shift_(grules_.token_id("INTEGER"), values_); parsertl::generator::build(grules_, gsm_); grules_.terminals(symbols_); grules_.non_terminals(symbols_); lrules_.push("[+]", grules_.token_id("'+'")); lrules_.push("-", grules_.token_id("'-'")); lrules_.push("[*]", grules_.token_id("'*'")); lrules_.push("[/]", grules_.token_id("'/'")); lrules_.push("\\d+", grules_.token_id("INTEGER")); lrules_.push("[(]", grules_.token_id("'('")); lrules_.push("[)]", grules_.token_id("')'")); lrules_.push("\\s+", lsm_.skip()); lexertl::generator::build(lrules_, lsm_); std::string expr_("1 + 2 * -3"); lexertl::citerator iter_(expr_.c_str(), expr_.c_str() + expr_.size(), lsm_); parsertl::match_results results_(iter_->id, gsm_); while (results_.entry.action != parsertl::action::error && results_.entry.action != parsertl::action::accept) { // Debug info switch (results_.entry.action) { case parsertl::action::error: throw std::runtime_error("Parser error"); break; case parsertl::action::shift: shift_(iter_); std::cout << 's' << results_.entry.param << '\n'; break; case parsertl::action::reduce: { parsertl::state_machine::size_t_size_t_pair &pair_ = gsm_._rules[results_.entry.param]; reduce_(results_.reduce_id()); std::cout << "reduce by " << symbols_[pair_.first] << " ->"; if (pair_.second.empty()) { std::cout << " %empty"; } else { for (auto iter_ = pair_.second.cbegin(), end_ = pair_.second.cend(); iter_ != end_; ++iter_) { std::cout << ' ' << symbols_[*iter_]; } } std::cout << '\n'; break; } case parsertl::action::go_to: std::cout << "goto " << results_.entry.param << '\n'; break; case parsertl::action::accept: std::cout << "accept\n"; break; } parsertl::lookup(iter_, gsm_, results_); } if (!values_.empty()) std::cout << '\n' << expr_ << " = " << values_.top() << '\n'; } catch (const std::exception &e) { std::cout << e.what() << '\n'; } return 0; }
It is now possible to search using a grammar as a sort of beefed up regex. Below I show how to search for uninitialised variables in C or C++ headers. Note how a grammar fragment is defined and yet we are only interested in the case where a variable is uninitialised. Note that this could occur in the middle of a comma separated list of variables.
There is an override for search()
that takes a multimap instead
of a set, should you wish to capture each rule that matched.
Finally, the last parameter to search()
is optional if you only care
about the entire grammar matching.
#include <parsertl/generator.hpp> #include <iostream> #include <parsertl/search.hpp> int main(int argc, char *argv[]) { parsertl::rules grules_; parsertl::state_machine gsm_; lexertl::rules lrules_; lexertl::state_machine lsm_; try { grules_.token("Bool Char Name NULLPTR Number String Type"); grules_.push("start", "decl"); grules_.push("decl", "Type list ';'"); grules_.push("list", "item " "| list ',' item"); const uint16_t uninit_idx_ = grules_.push("item", "Name"); const uint16_t init_idx_ = grules_.push("item", "Name '=' value"); grules_.push("value", "Bool " "| Char " "| Number " "| NULLPTR " "| String"); parsertl::generator::build(grules_, gsm_); lrules_.insert_macro("NAME", "[_A-Za-z][_0-9A-Za-z]*"); lrules_.push("=", grules_.token_id("'='")); lrules_.push(",", grules_.token_id("','")); lrules_.push(";", grules_.token_id("';'")); lrules_.push("true|TRUE|false|FALSE", grules_.token_id("Bool")); lrules_.push("nullptr", grules_.token_id("NULLPTR")); lrules_.push("BOOL|BSTR|BYTE|COLORREF|D?WORD|DWORD_PTR|DROPEFFECT|HACCEL|HANDLE|" "HBITMAP|HBRUSH|HCRYPTHASH|HCRYPTKEY|HCRYPTPROV|HCURSOR|HDBC|HICON|" "HINSTANCE|HMENU|HMODULE|HSTMT|HTREEITEM|HWND|LPARAM|LPCTSTR|" "LPDEVMODE|POSITION|SDWORD|SQLHANDLE|SQLINTEGER|SQLSMALLINT|UINT|" "U?INT_PTR|UWORD|WPARAM|" "bool|(unsigned\\s+)?char|double|float|" "(unsigned\\s+)?int((32|64)_t)?|long|size_t|" "{NAME}(\\s*::\\s*{NAME})*(\\s*[*])+", grules_.token_id("Type")); lrules_.push("{NAME}", grules_.token_id("Name")); lrules_.push("-?\\d+([.]\\d+)?", grules_.token_id("Number")); lrules_.push("'([^']|\\\\')*'", grules_.token_id("Char")); lrules_.push("[\"]([^\"]|\\\\[\"])*[\"]", grules_.token_id("String")); lrules_.push("[ \t\r\n]+|[/][/].*|[/][*](.|\n)*?[*][/]", lrules_.skip()); lexertl::generator::build(lrules_, lsm_); lexertl::memory_file mf_("D:\\Ben\\Dev\\scratchpad\\test.h"); lexertl::citerator iter_(mf_.data(), mf_.data() + mf_.size(), lsm_); lexertl::citerator end_; std::set<uint16_t> set_; do { if (parsertl::search(iter_, end_, gsm_, &set_)) if (set_.find(uninit_idx_) != set_.end()) std::cout << std::string(iter_->first, end_->first) << '\n'; iter_ = end_; } while (iter_->id != 0); } catch (const std::exception &e) { std::cout << e.what() << '\n'; } return 0; }
Hi Peter, thanks very much for your feedback!
I don't currently have a code generator for parsertl (the "tl" simply stands for template library by the way!)
It's interesting you should ask as I was looking into this this week due to wanting to use the PostgreSQL grammar. What I have realised is that for large (very large!) grammars like this, the lack of compression produces tables that are far too big to be practical.
It would be worth having a code generator anyway for completeness, but it makes me wonder about bringing in compression by default like bison does.
As I am eager to get this SQL grammar sorted out I will simply use bison to generate the tables and then scrape them using, you guessed it, parsertl.
I will produce a minimal yyparse() type lookup similar to parsertl::lookup() and see what gives. Of course I'm thinking that routine will be re-usable and you will be able to point it at the tables you want to use.
It's nice to have a serious parsing requirement (finally) and it has inspired me to think about this stuff again. I have a paper for minimal LR(1) generation which I would also like to look at properly and ultimately it would be great to support LR(*). A lot of grammars are in EBNF these days and I would like a means of supporting that directly.
Regards,
Ben
Ben,
Thanks for your work! Very much appreciated!
I've already successfully used lexertl at my previous job to replace this messy thing called flex. Now I'm thinking to also replace bison with parsertl (BTW -- Is this supposed to be Bavarian or Schwabian?) Does parsertl provide the same functionality lexertl provides of dumping out C code for an existing parser/lexer?Peter
Thanks for reporting this. It is fixed now.
Cheers.
seems there's a bug in the semantic actions example, at least checked with parsertl14. It is missing "parsertl/token.hpp" and the declaration as "token::token_vector productions_;" seems correct.
Thanks.
Thanks for the feedback!
Apologies for the delayed response.
I originally wanted to use the boost spirit Qi & LEX frame work for parsing.
I noticed the underlying Spirit.Lex was built on top of the lexertl
library.
It is easy to write grammar using the Lexertl/Parsertl and also easy to integrate with the C++ code without using lex/yacc.
we are able to use both lexertl
/parsertl
without any issues.
We slightly modified the library to invoke the custom callback functions.
Yes, once generated the parsertl state_machine
should be treated
as read only. It would therefore would be no problem to share a common sm amongst
multiple custom parser instances.
I would be interested to hear how you get on and how you find parsertl
to use in general!
We currently use the parsertl
library to parse some of our
domain specific scripts. We create multiple instances of the
parsertl::parser
struct to parse the scripts which have the same
parser/lexer rules (grammar). I assume the member variable parser::sm
(state_machine) would be the same in the all parser structs that use the same
parser/lexer rules and is used read-only during parsing after building it using
the function call parsertl::generator::build(parserRules, parser.sm)
.
Can I share the instance of state_machine
across the multiple
parser objects with the same parser/lexer rules? I plan to implement a new
parser class based on parsertl::parser
to share the
state_machine
object. Please let me know your thoughts.
Hi Sudhakar B
There is no built in $$
function yet. I'm thinking that the caller
can maintain that state externally as required, although I'm unclear if
this is the right way to go. So far I've done interfaces for lookup where
you only pay for what you use.
Hi Ben
Thanks for the great work. I just started using parsertl
&
lexertl
. Is there any equivalent dollar function to get the
pseudo variable $$
?
Thanks.