PCRE.cc 7.08 KB


#include <qpdf/PCRE.hh>
#include <qpdf/QUtil.hh>

#include <iostream>
#include <string.h>

// Construct a PCRE exception.  The stored message is the caller's
// text with the fixed prefix "PCRE error: " put in front of it.
PCRE::Exception::Exception(std::string const& message)
{
    std::string const prefixed = "PCRE error: " + message;
    setMessage(prefixed);
}

// NoBackref is the Exception thrown by Match::getOffsetLength when
// the requested backreference is unavailable.  Its text is always
// "no match".
PCRE::NoBackref::NoBackref() : Exception("no match") {}

// Create a Match that does not yet describe a successful match: the
// match count starts at -1, which operator bool reports as false.
// The offset vector is sized for nbackrefs capturing groups.
PCRE::Match::Match(int nbackrefs, char const* subject)
{
    int const no_match = -1;
    init(no_match, nbackrefs, subject);
}

// Release the offset vector owned by this Match.
PCRE::Match::~Match()
{
    destroy();
}

// Copy constructor.  copy() initializes every field through init()
// and then duplicates rhs's offset vector, so nothing needs to be set
// up beforehand.
PCRE::Match::Match(Match const& rhs)
{
    copy(rhs);
}

// Copy assignment.  Self-assignment is a no-op; otherwise the current
// offset vector is released and rhs's state is duplicated.
PCRE::Match&
PCRE::Match::operator=(Match const& rhs)
{
    if (this == &rhs)
    {
	return *this;
    }

    destroy();
    copy(rhs);
    return *this;
}

// Set every field of this Match and allocate the offset vector that
// PCRE::match passes to pcre_exec.
//
//   nmatches   match count to record (the public constructor passes
//              -1, which operator bool treats as "no match")
//   nbackrefs  number of capturing groups in the pattern
//   subject    subject string; only the pointer is stored, the text
//              is not copied
//
// The vector holds 3 * (1 + nbackrefs) ints.  Any previous vector is
// NOT freed here; callers that re-initialize must call destroy()
// first.
void
PCRE::Match::init(int nmatches, int nbackrefs, char const* subject)
{
    this->nmatches = nmatches;
    this->nbackrefs = nbackrefs;
    this->subject = subject;
    this->ovecsize = 3 * (1 + nbackrefs);
    this->ovector = 0;
    // Allocate only for a positive size.  A negative nbackrefs makes
    // ovecsize negative, which must not be handed to new[].
    if (this->ovecsize > 0)
    {
	this->ovector = new int[this->ovecsize];
    }
}

// Make this Match a duplicate of rhs: re-initialize all fields from
// rhs (which allocates a fresh offset vector of the same size) and
// then copy rhs's offsets into it.  The old vector is not released
// here.
void
PCRE::Match::copy(Match const& rhs)
{
    init(rhs.nmatches, rhs.nbackrefs, rhs.subject);

    int const* src = rhs.ovector;
    int* dst = this->ovector;
    for (int remaining = this->ovecsize; remaining > 0; --remaining)
    {
	*dst++ = *src++;
    }
}

void
PCRE::Match::destroy()
{
    delete [] this->ovector;
}

// A Match converts to true when its match count is non-negative,
// i.e. when it records a successful match.
PCRE::Match::operator bool()
{
    return !(nmatches < 0);
}


// Return the text matched by backreference n (0 is the whole match).
//
//   n      backreference number
//   flags  if gm_no_substring_returns_empty is set, an unavailable
//          backreference yields "" instead of an exception
//
// Throws NoBackref when the backreference is unavailable and the flag
// above is not set.
std::string
PCRE::Match::getMatch(int n, int flags)
    throw(QEXC::General, Exception)
{
    // This method used to be implemented in terms of
    // pcre_get_substring, but that function gives you an empty string
    // for an unmatched backreference that is in range.

    int offset;
    int length;
    try
    {
	getOffsetLength(n, offset, length);
    }
    catch (NoBackref&)
    {
	if (flags & gm_no_substring_returns_empty)
	{
	    return "";
	}
	throw;
    }

    // Never hand a negative count to the string constructor; it would
    // be converted to a huge unsigned value.
    if (length < 0)
    {
	length = 0;
    }

    // Build the result straight from the subject buffer.  Going
    // through std::string(subject) would copy the entire subject and
    // stop at the first NUL, truncating (or throwing out_of_range
    // for) matches that lie beyond an embedded NUL.
    return std::string(this->subject + offset,
		       static_cast<std::string::size_type>(length));
}

// Report where backreference n matched within the subject.
//
//   n       backreference number (0 is the whole match)
//   offset  receives the start offset, read from ovector[2n]
//   length  receives ovector[2n + 1] minus the start offset
//
// Throws NoBackref if there was no match, if n is negative or not
// below the match count, or if the start offset is -1 (the group did
// not take part in the match).  The outputs are untouched on throw.
void
PCRE::Match::getOffsetLength(int n, int& offset, int& length) throw(Exception)
{
    // The n < 0 test must come before the ovector access: a negative
    // index would read memory in front of the vector.
    if ((this->nmatches < 0) ||
	(n < 0) ||
	(n > this->nmatches - 1) ||
	(this->ovector[n * 2] == -1))
    {
	throw NoBackref();
    }
    offset = this->ovector[n * 2];
    length = this->ovector[n * 2 + 1] - offset;
}


int
PCRE::Match::getOffset(int n) throw(Exception)
{
    int offset;
    int length;
    this->getOffsetLength(n, offset, length);
    return offset;
}


int
PCRE::Match::getLength(int n) throw(Exception)
{
    int offset;
    int length;
    this->getOffsetLength(n, offset, length);
    return length;
}


int
PCRE::Match::nMatches() const
{
    return this->nmatches;
}

// Compile a regular expression.
//
//   pattern  the regular expression text
//   options  option bits passed straight to pcre_compile
//
// Throws Exception if the pattern does not compile (the message
// carries the pattern, the error offset and PCRE's error text) or if
// the number of capturing groups cannot be determined.
PCRE::PCRE(char const* pattern, int options) throw (Exception)
{
    char const *errptr;
    int erroffset;
    this->code = pcre_compile(pattern, options, &errptr, &erroffset, 0);
    if (! this->code)
    {
	std::string message = (std::string("compilation of ") + pattern +
			       " failed at offset " +
			       QUtil::int_to_string(erroffset) + ": " +
			       errptr);
	throw Exception(message);
    }

    // pcre_info is obsolete and absent from later PCRE releases;
    // pcre_fullinfo with PCRE_INFO_CAPTURECOUNT is the supported way
    // to obtain the number of capturing groups.
    this->nbackrefs = 0;
    int status = pcre_fullinfo(this->code, 0, PCRE_INFO_CAPTURECOUNT,
			       &(this->nbackrefs));
    if (status != 0)
    {
	// The destructor does not run when a constructor throws, so
	// release the compiled pattern here.
	pcre_free(this->code);
	this->code = 0;
	throw Exception("pcre_fullinfo returned " +
			QUtil::int_to_string(status));
    }
}

// Release the compiled pattern obtained from pcre_compile.
PCRE::~PCRE()
{
    pcre_free(code);
}

// Run the compiled pattern against subject.
//
//   subject      text to search; the returned Match keeps this
//                pointer, so the text must outlive the Match
//   options      option bits passed straight to pcre_exec
//   startoffset  offset in subject at which to start matching
//   size         length of subject, or -1 to use strlen(subject)
//
// Returns a Match that tests true on success and false when the
// pattern simply did not match.  Throws Exception for a bad option or
// insufficient memory, and QEXC::Internal for any other pcre_exec
// error code.
PCRE::Match
PCRE::match(char const* subject, int options, int startoffset, int size)
    throw (QEXC::General, Exception)
{
    // Only measure a non-null subject.  A null subject is passed on
    // unchanged so that pcre_exec rejects it with an error code
    // rather than strlen dereferencing a null pointer.
    if ((size == -1) && subject)
    {
	size = static_cast<int>(strlen(subject));
    }

    Match result(this->nbackrefs, subject);
    int status = pcre_exec(this->code, 0, subject, size,
			   startoffset, options,
			   result.ovector, result.ovecsize);
    if (status >= 0)
    {
	result.nmatches = status;
	return result;
    }

    switch (status)
    {
      case PCRE_ERROR_NOMATCH:
	// Not an error: result keeps its initial "no match" state.
	break;

      case PCRE_ERROR_BADOPTION:
	throw Exception("bad option passed to PCRE::match()");

      case PCRE_ERROR_NOMEMORY:
	throw Exception("insufficient memory");

      case PCRE_ERROR_NULL:
      case PCRE_ERROR_BADMAGIC:
      case PCRE_ERROR_UNKNOWN_NODE:
      default:
	throw QEXC::Internal(
	    "pcre_exec returned " + QUtil::int_to_string(status));
    }

    return result;
}

// Self-test driver that exercises PCRE and PCRE::Match and writes the
// results to std::cout.
//
//   n  selects the scenario: 1 runs only the UTF-8 comparison and
//      returns; any other value runs the remaining cases (compile
//      failure, offsets/lengths, optional groups, backref table).
//
// Any QEXC::General that escapes an individual case is reported as
// "unexpected exception" rather than propagated.
void
PCRE::test(int n)
{
    try
    {
	if (n == 1)
	{
	    // Match the same subject, which contains a non-ASCII
	    // letter, once without and once with PCRE_UTF8, printing
	    // group 1 of each successful match.
	    static char const* utf8 = "abπdefq";
	    PCRE u1("^([[:alpha:]]+)");
	    PCRE u2("^([\\p{L}]+)", PCRE_UTF8);
	    PCRE::Match m1 = u1.match(utf8);
	    if (m1)
	    {
		std::cout << "no utf8: " << m1.getMatch(1) << std::endl;
	    }
	    PCRE::Match m2 = u2.match(utf8);
	    if (m2)
	    {
		std::cout << "utf8: " << m2.getMatch(1) << std::endl;
	    }
	    return;
	}

	// Constructing from this pattern is expected to throw; the
	// compile error message is printed.
	try
	{
	    PCRE pcre1("a**");
	}
	catch (Exception& e)
	{
	    std::cout << e.unparse() << std::endl;
	}

	// Two capturing groups: print the match count, then text,
	// offset and length for backrefs 0 through 2.
	PCRE pcre2("^([^\\s:]*)\\s*:\\s*(.*?)\\s*$");
	PCRE::Match m2 = pcre2.match("key: value one two three ");
	if (m2)
	{
	    std::cout << m2.nMatches() << std::endl;
	    std::cout << m2.getMatch(0) << std::endl;
	    std::cout << m2.getOffset(0) << std::endl;
	    std::cout << m2.getLength(0) << std::endl;
	    std::cout << m2.getMatch(1) << std::endl;
	    std::cout << m2.getOffset(1) << std::endl;
	    std::cout << m2.getLength(1) << std::endl;
	    std::cout << m2.getMatch(2) << std::endl;
	    std::cout << m2.getOffset(2) << std::endl;
	    std::cout << m2.getLength(2) << std::endl;
	    // Backref 3 does not exist in this pattern; both accessors
	    // are expected to throw and the message is printed.
	    try
	    {
		std::cout << m2.getMatch(3) << std::endl;
	    }
	    catch (Exception& e)
	    {
		std::cout << e.unparse() << std::endl;
	    }
	    try
	    {
		std::cout << m2.getOffset(3) << std::endl;
	    }
	    catch (Exception& e)
	    {
		std::cout << e.unparse() << std::endl;
	    }
	}
	// Optional second group that the subject "aaa" cannot fill.
	PCRE pcre3("^(a+)(b+)?$");
	PCRE::Match m3 = pcre3.match("aaa");
	try
	{
	    if (m3)
	    {
		std::cout << m3.nMatches() << std::endl;
		std::cout << m3.getMatch(0) << std::endl;
		std::cout << m3.getMatch(1) << std::endl;
		// With the flag, the unavailable group is printed as an
		// empty string between the dashes.
		std::cout << "-"
			  << m3.getMatch(
			      2, Match::gm_no_substring_returns_empty)
			  << "-" << std::endl;
		std::cout << "hello" << std::endl;
		// Without the flag this call is expected to throw, so
		// the following line should never be printed.
		std::cout << m3.getMatch(2) << std::endl;
		std::cout << "can't see this" << std::endl;
	    }
	}
	catch (Exception& e)
	{
	    std::cout << e.unparse() << std::endl;
	}

	// Table-driven check: for each candidate, print every backref
	// below the match count via both getMatch and getOffsetLength,
	// or a "no backref" marker when NoBackref is thrown.
	// backref: 1   2 3        4      5
	PCRE pcre4("^((?:(a(b)?)(?:,(c))?)|(c))?$");
	static char const* candidates[] = {
	    "qqqcqqq",		// no match
	    "ab,c",		// backrefs: 0, 1, 2, 3, 4
	    "ab",		// backrefs: 0, 1, 2, 3
	    "a",		// backrefs: 0, 1, 2
	    "a,c",		// backrefs: 0, 1, 2, 4
	    "c",		// backrefs: 0, 1, 5
	    "",			// backrefs: 0
	    0
	};
	// The null pointer at the end of the table terminates the loop.
	for (char const** p = candidates; *p; ++p)
	{
	    PCRE::Match m(pcre4.match(*p));
	    if (m)
	    {
		int nmatches = m.nMatches();
		for (int i = 0; i < nmatches; ++i)
		{
		    std::cout << *p << ": " << i << ": ";
		    try
		    {
			std::string match = m.getMatch(i);
			std::cout << match;
		    }
		    catch (NoBackref&)
		    {
			std::cout << "no backref (getMatch)";
		    }
		    std::cout << std::endl;

		    std::cout << *p << ": " << i << ": ";
		    try
		    {
			int offset;
			int length;
			m.getOffsetLength(i, offset, length);
			std::cout << offset << ", " << length;
		    }
		    catch (NoBackref&)
		    {
			std::cout << "no backref (getOffsetLength)";
		    }
		    std:: cout << std::endl;
		}
	    }
	    else
	    {
		std::cout << *p << ": no match" << std::endl;
	    }
	}
    }
    catch (QEXC::General& e)
    {
	std::cout << "unexpected exception: " << e.unparse() << std::endl;
    }
}