Text Processing Wisdom

Let's say you want to scan C or C++ header files and look for stray semicolons (a semicolon following the terminating '}' of a function body). What is the easiest way to do that?
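
For example, this is the kind of (hypothetical) input we want the scanner to flag:

void f()
{
};	// <- the stray semicolon to report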

This is my solution:

#include <algorithm>
#include <cstdint>
#include <include/parsertl/generator.hpp>
#include <iostream>
#include <include/lexertl/iterator.hpp>
#include <include/parsertl/search_iterator.hpp>
#include <include/lexertl/memory_file.hpp>

// Token ids looked up once the grammar is built (~0 means not yet assigned)
std::uint16_t g_stray_open = static_cast<std::uint16_t>(~0);
std::uint16_t g_stray_close = static_cast<std::uint16_t>(~0);
std::uint16_t g_stray_semi = static_cast<std::uint16_t>(~0);

void build_parser(lexertl::state_machine& lsm, parsertl::state_machine& psm)
{
	parsertl::rules grules;
	lexertl::rules lrules;

	// '}' and ';' appear in no grammar rule, but declaring them here
	// assigns the token ids used by the brace counting loop in main()
	grules.token("'}' ';' Char Keyword Name String");

	grules.push("start", "')' names '{'");
	grules.push("names", "%empty | names Name");
	g_stray_open = grules.token_id("'{'");
	g_stray_close = grules.token_id("'}'");
	g_stray_semi = grules.token_id("';'");
	parsertl::generator::build(grules, psm);

	lrules.push(R"(\))", grules.token_id("')'"));
	lrules.push(R"(\{)", grules.token_id("'{'"));
	lrules.push(R"(\})", grules.token_id("'}'"));
	lrules.push(";", grules.token_id("';'"));
	lrules.push("class|enum|struct", grules.token_id("Keyword"));
	lrules.push(R"([A-Z_a-z]\w*)", grules.token_id("Name"));
	lrules.push(R"(#define\s*(.|\\\r?\n)*)", lrules.skip());
	lrules.push(R"(["]([^"\\]|\\.)*["])", grules.token_id("String"));
	lrules.push(R"('([^'\\]|\\.)+')", grules.token_id("Char"));
	lrules.push("[ \t\r\n]+|[/][/].*|[/][*](?s:.)*?[*][/]", lrules.skip());
	lexertl::generator::build(lrules, lsm);
}

int main(int argc, char* argv[])
{
	try
	{
		if (argc < 2)
		{
			std::cerr << "USAGE: " << argv[0] << " <header file>\n";
			return 1;
		}

		// Map the header file to scan into memory
		lexertl::memory_file mf(argv[1]);
		const char* first = mf.data();
		const char* second = first + mf.size();
		lexertl::state_machine lsm;
		parsertl::state_machine psm;

		build_parser(lsm, psm);

		// Search the token stream for matches of the grammar
		lexertl::citerator liter(first, second, lsm);
		parsertl::csearch_iterator iter(liter, psm);
		parsertl::csearch_iterator end;

		for (; iter != end; ++iter)
		{
			// Resume lexing just past the '{' that ends the match and
			// count braces from there
			lexertl::citerator i(iter->front().front().second, second, lsm);
			int count = 1;

			// When count hits zero, the final ++i leaves i on the token
			// that follows the matching '}'
			for (; i->id && count; ++i)
			{
				if (i->id == g_stray_open)
					++count;
				else if (i->id == g_stray_close)
					--count;
			}

			// Report in compiler style: count newlines up to the token
			// to turn its position into a line number
			if (i->id == g_stray_semi)
				std::cout << argv[1] << '(' <<
					std::count(first, i->first, '\n') + 1 <<
					"): Stray semicolon.\n";
		}
	}
	catch (const std::exception& e)
	{
		std::cerr << e.what() << '\n';
	}

	return 0;
}
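
Assuming the program above is built as stray_semi (both the binary and header names here are illustrative), a run looks something like this:

stray_semi some_header.hpp
some_header.hpp(42): Stray semicolon.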

So, why not just use a regex to do this? Because we want to discard whitespace and comments appropriately, and we would like not to be derailed by strings and character literals.
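
For instance, any regex that looks for "};" will misfire as soon as that character sequence shows up inside a literal or a comment:

const char* p = "};";	// a plain regex would report a stray semicolon here
/* and again here: }; */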

OK, but couldn't a lexer alone do the job?

Well, the subtlety there is that by searching with a grammar we get a sliding window over our text for free, one which would be error-prone to write by hand.
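
To see what that window buys us, here are some illustrative signatures that the single rule ')' names '{' matches with no extra code:

void f() {                  // ')' immediately followed by '{'
void g() const {            // one Name between ')' and '{'
void h() const noexcept {   // several Names between ')' and '{'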

Note how we use the grammar merely to locate the start of each candidate match and then break out into a simple loop that counts curly braces. This makes for a good example of how we can combine techniques to get the flexibility we need whilst keeping the code to a minimum.