// test_tokenizer.cc 8.74 KB
#include <qpdf/QPDFTokenizer.hh>
#include <qpdf/QUtil.hh>
#include <qpdf/FileInputSource.hh>
#include <qpdf/BufferInputSource.hh>
#include <qpdf/QPDF.hh>
#include <qpdf/Pl_Buffer.hh>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <iostream>

// Program name shown in usage and error messages. main() derives it
// from argv[0].
static char const* whoami = 0;

// Write a one-line usage summary to standard error and terminate the
// program with exit status 2. This function does not return.
void usage()
{
    std::cerr << "Usage: " << whoami;
    std::cerr << " [-maxlen len | -no-ignorable] filename";
    std::cerr << std::endl;
    exit(2);
}

// InputSource::Finder subclass that remembers an input source and a
// target string. The check() override is defined out of line and uses
// both members.
class Finder: public InputSource::Finder
{
  public:
    // source: the input source check() reads from and repositions.
    // target: the word check() compares the next token against.
    Finder(PointerHolder<InputSource> source, std::string const& target) :
        is(source),
        str(target)
    {
    }
    virtual ~Finder() {}
    virtual bool check();

  private:
    PointerHolder<InputSource> is;
    std::string str;
};

bool
Finder::check()
{
    QPDFTokenizer tokenizer;
    QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true);
    qpdf_offset_t offset = this->is->tell();
    bool result = (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str));
    this->is->seek(offset - this->str.length(), SEEK_SET);
    return result;
}

// Map a token type to the short label printed in the token dump.
// Returns a null pointer if ttype matches none of the cases below.
static char const* tokenTypeName(QPDFTokenizer::token_type_e ttype)
{
    // Do this as a switch statement without a default label, rather
    // than as a lookup table, so the compiler will warn if we miss
    // any enumerator.
    char const* name = 0;
    switch (ttype)
    {
      case QPDFTokenizer::tt_bad:
        name = "bad";
        break;
      case QPDFTokenizer::tt_array_close:
        name = "array_close";
        break;
      case QPDFTokenizer::tt_array_open:
        name = "array_open";
        break;
      case QPDFTokenizer::tt_brace_close:
        name = "brace_close";
        break;
      case QPDFTokenizer::tt_brace_open:
        name = "brace_open";
        break;
      case QPDFTokenizer::tt_dict_close:
        name = "dict_close";
        break;
      case QPDFTokenizer::tt_dict_open:
        name = "dict_open";
        break;
      case QPDFTokenizer::tt_integer:
        name = "integer";
        break;
      case QPDFTokenizer::tt_name:
        name = "name";
        break;
      case QPDFTokenizer::tt_real:
        name = "real";
        break;
      case QPDFTokenizer::tt_string:
        name = "string";
        break;
      case QPDFTokenizer::tt_null:
        name = "null";
        break;
      case QPDFTokenizer::tt_bool:
        name = "bool";
        break;
      case QPDFTokenizer::tt_word:
        name = "word";
        break;
      case QPDFTokenizer::tt_eof:
        name = "eof";
        break;
      case QPDFTokenizer::tt_space:
        name = "space";
        break;
      case QPDFTokenizer::tt_comment:
        name = "comment";
        break;
      case QPDFTokenizer::tt_inline_image:
        name = "inline-image";
        break;
    }
    return name;
}

// Return a copy of value in which every character outside the range
// 32..126 is replaced by "\x" followed by the result of
// QUtil::int_to_string_base(<char as unsigned char>, 16, 2).
// Characters inside that range are copied unchanged.
static std::string
sanitize(std::string const& value)
{
    std::string cleaned;
    for (std::string::size_type i = 0; i < value.length(); ++i)
    {
        char const ch = value[i];
        if ((ch < 32) || (ch > 126))
        {
            // Cast through unsigned char so a negative char value is
            // not passed on as a negative integer.
            cleaned += "\\x";
            cleaned += QUtil::int_to_string_base(
                static_cast<unsigned char>(ch), 16, 2);
        }
        else
        {
            cleaned += ch;
        }
    }
    return cleaned;
}

// Announce on standard output that we are skipping to `what`, then
// ask the input source to locate it starting at the current position,
// using f as the finder. If the search fails, report that and seek
// back to the position where the search began.
//
// The tokenizer and max_len parameters are not used by the body.
static void
try_skipping(QPDFTokenizer& tokenizer, PointerHolder<InputSource> is,
             size_t max_len, char const* what, Finder& f)
{
    std::cout << "skipping to " << what << std::endl;
    qpdf_offset_t const start = is->tell();
    bool const found = is->findFirst(what, start, 0, f);
    if (found)
    {
        return;
    }
    std::cout << what << " not found" << std::endl;
    is->seek(start, SEEK_SET);
}

// Tokenize the given input source until an EOF token is read, writing
// one line per token to standard output between "--- BEGIN <label>
// ---" and "--- END <label> ---" markers.
//
// is:                 input source to read tokens from
// label:              text placed in the BEGIN/END marker lines
// max_len:            passed to readToken for every read except the
//                     one immediately following an "ID" word (see
//                     skip_inline_images), where 0 is passed instead
// include_ignorable:  if true, call includeIgnorable() on the
//                     tokenizer
// skip_streams:       if true, call try_skipping() to move to
//                     "endstream" after each "stream" word token
// skip_inline_images: if true, call expectInlineImage() on the
//                     tokenizer after each "ID" word token
static void
dump_tokens(PointerHolder<InputSource> is, std::string const& label,
            size_t max_len, bool include_ignorable,
            bool skip_streams, bool skip_inline_images)
{
    Finder endstream_finder(is, "endstream");
    std::cout << "--- BEGIN " << label << " ---" << std::endl;

    QPDFTokenizer tokenizer;
    tokenizer.allowEOF();
    if (include_ignorable)
    {
        tokenizer.includeIgnorable();
    }

    QPDFTokenizer::Token stream_word(QPDFTokenizer::tt_word, "stream");
    QPDFTokenizer::Token id_word(QPDFTokenizer::tt_word, "ID");

    // Nonzero only for the single read that follows an "ID" word: it
    // holds the position just after "ID" so scanning can restart
    // there if that read produces a bad token.
    qpdf_offset_t resume_at = 0;
    for (;;)
    {
        size_t const limit = resume_at ? 0 : max_len;
        QPDFTokenizer::Token token =
            tokenizer.readToken(is, "test", true, limit);
        QPDFTokenizer::token_type_e const ttype = token.getType();

        if (resume_at != 0)
        {
            qpdf_offset_t const rewind_to = resume_at;
            resume_at = 0;
            if (ttype == QPDFTokenizer::tt_bad)
            {
                std::cout << "EI not found; resuming normal scanning"
                          << std::endl;
                is->seek(rewind_to, SEEK_SET);
                continue;
            }
        }

        bool const at_eof = (ttype == QPDFTokenizer::tt_eof);
        qpdf_offset_t const offset = is->getLastOffset();
        std::cout << offset << ": " << tokenTypeName(ttype);
        if (! at_eof)
        {
            std::string const& value = token.getValue();
            std::string const& raw = token.getRawValue();
            std::cout << ": " << sanitize(value);
            // Show the raw form only when it differs from the value.
            if (value != raw)
            {
                std::cout << " (raw: " << sanitize(raw) << ")";
            }
        }
        if (ttype == QPDFTokenizer::tt_bad)
        {
            std::cout << " (" << token.getErrorMessage() << ")";
        }
        std::cout << std::endl;

        if (skip_streams && (token == stream_word))
        {
            try_skipping(tokenizer, is, max_len, "endstream",
                         endstream_finder);
        }
        else if (skip_inline_images && (token == id_word))
        {
            tokenizer.expectInlineImage();
            resume_at = is->tell();
        }
        else if (at_eof)
        {
            break;
        }
    }
    std::cout << "--- END " << label << " ---" << std::endl;
}

// Dump tokens for one PDF file in three passes:
//   1. the file itself, labelled "FILE", skipping over stream data;
//   2. the contents of each page, labelled "PAGE <n>" with n counting
//      from 1, with inline-image handling enabled;
//   3. the data of every stream object whose dictionary /Type is
//      /ObjStm, labelled "OBJECT STREAM <object id>".
//
// filename:          path of the PDF file to read
// include_ignorable: forwarded to dump_tokens
// max_len:           forwarded to dump_tokens
static void process(char const* filename, bool include_ignorable,
                    size_t max_len)
{
    PointerHolder<InputSource> is;

    // Tokenize file, skipping streams. Hand the new object to the
    // PointerHolder before calling setFilename() so that it is not
    // left unowned if setFilename() throws.
    FileInputSource* fis = new FileInputSource();
    is = fis;
    fis->setFilename(filename);
    dump_tokens(is, "FILE", max_len, include_ignorable, true, false);

    // Tokenize content streams, skipping inline images
    QPDF qpdf;
    qpdf.processFile(filename);
    std::vector<QPDFObjectHandle> pages = qpdf.getAllPages();
    int pageno = 0;
    for (std::vector<QPDFObjectHandle>::iterator iter = pages.begin();
         iter != pages.end(); ++iter)
    {
        ++pageno;
        // Pipe all of the page's content streams into a single buffer
        // so the page is tokenized as one unit.
        Pl_Buffer plb("buffer");
        std::vector<QPDFObjectHandle> contents = (*iter).getPageContents();
        for (std::vector<QPDFObjectHandle>::iterator citer = contents.begin();
             citer != contents.end(); ++citer)
        {
            (*citer).pipeStreamData(&plb, 0, qpdf_dl_specialized);
        }
        plb.finish();
        PointerHolder<Buffer> content_data = plb.getBuffer();
        BufferInputSource* bis = new BufferInputSource(
            "content data", content_data.getPointer());
        is = bis;
        dump_tokens(is, "PAGE " + QUtil::int_to_string(pageno),
                    max_len, include_ignorable, false, true);
    }

    // Tokenize object streams
    std::vector<QPDFObjectHandle> all = qpdf.getAllObjects();
    for (std::vector<QPDFObjectHandle>::iterator iter = all.begin();
         iter != all.end(); ++iter)
    {
        if ((*iter).isStream() &&
            (*iter).getDict().getKey("/Type").isName() &&
            (*iter).getDict().getKey("/Type").getName() == "/ObjStm")
        {
            PointerHolder<Buffer> b =
                (*iter).getStreamData(qpdf_dl_specialized);
            BufferInputSource* bis = new BufferInputSource(
                "object stream data", b.getPointer());
            is = bis;
            dump_tokens(is, "OBJECT STREAM " +
                        QUtil::int_to_string((*iter).getObjectID()),
                        max_len, include_ignorable, false, false);
        }
    }
}

// Entry point. Accepts the options -maxlen <len> and -no-ignorable
// plus exactly one filename, then calls process() on that file.
// Exits with status 2 (via usage() or the exception handler) on bad
// arguments or if process() throws a std::exception; returns 0
// otherwise.
int main(int argc, char* argv[])
{
    QUtil::setLineBuf(stdout);
    // Use the part of argv[0] after the last '/' as the program name.
    if ((whoami = strrchr(argv[0], '/')) == NULL)
    {
        whoami = argv[0];
    }
    else
    {
        ++whoami;
    }
    // For libtool's sake....
    if (strncmp(whoami, "lt-", 3) == 0)
    {
        whoami += 3;
    }

    char const* filename = 0;
    size_t max_len = 0;
    bool include_ignorable = true;
    for (int i = 1; i < argc; ++i)
    {
        if (argv[i][0] == '-')
        {
            if (strcmp(argv[i], "-maxlen") == 0)
            {
                if (++i >= argc)
                {
                    usage();
                }
                // Reject a negative length rather than letting it
                // wrap around to a huge unsigned value.
                int len = QUtil::string_to_int(argv[i]);
                if (len < 0)
                {
                    usage();
                }
                max_len = static_cast<size_t>(len);
            }
            else if (strcmp(argv[i], "-no-ignorable") == 0)
            {
                include_ignorable = false;
            }
            else
            {
                usage();
            }
        }
        else if (filename)
        {
            // Only one filename is accepted.
            usage();
        }
        else
        {
            filename = argv[i];
        }
    }
    if (filename == 0)
    {
        usage();
    }

    try
    {
        process(filename, include_ignorable, max_len);
    }
    catch (std::exception const& e)
    {
        // NOTE(review): no trailing newline is written after the
        // message; left unchanged in case expected test output
        // depends on the exact bytes.
        std::cerr << whoami << ": exception: " << e.what();
        exit(2);
    }
    return 0;
}