Commit 0fe8d4476205c97e402e555aac41a88e70e3e9b2

Authored by Jay Berkenbilt
1 parent 63c7eefe

Support stream data -- not tested

There are no automated tests yet, but committing work so far in
preparation for some refactoring.
... ... @@ -54,14 +54,14 @@ Soon: Break ground on "Document-level work"
54 54 Output JSON v2
55 55 ==============
56 56  
57   -XXX
58   -
59 57 * Reread from perspective of update
60 58 * Test all ignore cases with QTC
61 59 * Test case of correct file with dict before data/datafile
62 60 * Have a test case if possible that exercises the object description
63 61 which means we need some kind of semantic error that gets caught
64 62 after creation.
  63 +* Test invalid data, invalid data file
  64 +* Tests: round-trip through json, round-trip through qpdf --qdf
65 65  
66 66 Try to never flatten pages tree. Make sure we do something reasonable
67 67 with pages tree repair. The problem is that if pages tree repair is
... ... @@ -236,6 +236,11 @@ Other documentation fodder:
236 236  
237 237 You can't create a PDF from v1 json because
238 238  
  239 +* Change: names are written in canonical form with a leading slash
  240 + just as they are treated in the code. In v1, they were written in
  241 + PDF syntax in the json file. Example: /text#2fplain in pdf will be
  242 + written as /text/plain in json v2 and as /text#2fplain in json v1.
  243 +
239 244 * The PDF version header is not recorded
240 245  
241 246 * Strings cannot be unambiguously encoded/decoded
... ...
include/qpdf/QPDF.hh
... ... @@ -998,7 +998,8 @@ class QPDF
998 998 class JSONReactor: public JSON::Reactor
999 999 {
1000 1000 public:
1001   - JSONReactor(QPDF&, std::string const& filename, bool must_be_complete);
  1001 + JSONReactor(
  1002 + QPDF&, std::shared_ptr<InputSource> is, bool must_be_complete);
1002 1003 virtual ~JSONReactor() = default;
1003 1004 virtual void dictionaryStart() override;
1004 1005 virtual void arrayStart() override;
... ... @@ -1033,7 +1034,7 @@ class QPDF
1033 1034 QPDFObjectHandle to_replace, QPDFObjectHandle replacement);
1034 1035  
1035 1036 QPDF& pdf;
1036   - std::string filename;
  1037 + std::shared_ptr<InputSource> is;
1037 1038 bool must_be_complete;
1038 1039 bool errors;
1039 1040 bool parse_error;
... ...
libqpdf/QPDF_Dictionary.cc
... ... @@ -37,9 +37,10 @@ QPDF_Dictionary::getJSON(int json_version)
37 37 JSON j = JSON::makeDictionary();
38 38 for (auto& iter: this->items) {
39 39 if (!iter.second.isNull()) {
40   - j.addDictionaryMember(
41   - QPDF_Name::normalizeName(iter.first),
42   - iter.second.getJSON(json_version));
  40 + std::string key =
  41 + (json_version == 1 ? QPDF_Name::normalizeName(iter.first)
  42 + : iter.first);
  43 + j.addDictionaryMember(key, iter.second.getJSON(json_version));
43 44 }
44 45 }
45 46 return j;
... ...
libqpdf/QPDF_Name.cc
... ... @@ -42,7 +42,11 @@ QPDF_Name::unparse()
42 42 JSON
43 43 QPDF_Name::getJSON(int json_version)
44 44 {
45   - return JSON::makeString(normalizeName(this->name));
  45 + if (json_version == 1) {
  46 + return JSON::makeString(normalizeName(this->name));
  47 + } else {
  48 + return JSON::makeString(this->name);
  49 + }
46 50 }
47 51  
48 52 QPDFObject::object_type_e
... ...
libqpdf/QPDF_json.cc
1 1 #include <qpdf/QPDF.hh>
2 2  
3 3 #include <qpdf/FileInputSource.hh>
  4 +#include <qpdf/Pl_Base64.hh>
4 5 #include <qpdf/QIntC.hh>
5 6 #include <qpdf/QTC.hh>
6 7 #include <qpdf/QUtil.hh>
  8 +#include <algorithm>
7 9 #include <regex>
8 10  
9 11 // This chart shows an example of the state transitions that would
... ... @@ -52,17 +54,40 @@ static char const* JSON_PDF = (
52 54 "9\n"
53 55 "%%EOF\n");
54 56  
  57 +// Note use of [\\s\\S] rather than . to match any character since .
  58 +// doesn't match newlines.
55 59 static std::regex PDF_VERSION_RE("^\\d+\\.\\d+$");
56 60 static std::regex OBJ_KEY_RE("^obj:(\\d+) (\\d+) R$");
57 61 static std::regex INDIRECT_OBJ_RE("^(\\d+) (\\d+) R$");
58   -static std::regex UNICODE_RE("^u:(.*)$");
  62 +static std::regex UNICODE_RE("^u:([\\s\\S]*)$");
59 63 static std::regex BINARY_RE("^b:((?:[0-9a-fA-F]{2})*)$");
60   -static std::regex NAME_RE("^/.*$");
  64 +static std::regex NAME_RE("^/[\\s\\S]*$");
  65 +
  66 +static std::function<void(Pipeline*)>
  67 +provide_data(std::shared_ptr<InputSource> is, size_t start, size_t end)
  68 +{
  69 + return [is, start, end](Pipeline* p) {
  70 + Pl_Base64 decode("base64-decode", p, Pl_Base64::a_decode);
  71 + p = &decode;
  72 + size_t bytes = end - start;
  73 + char buf[8192];
  74 + is->seek(QIntC::to_offset(start), SEEK_SET);
  75 + size_t len = 0;
  76 + while ((len = is->read(buf, std::min(bytes, sizeof(buf)))) > 0) {
  77 + p->write(buf, len);
  78 + bytes -= len;
  79 + if (bytes == 0) {
  80 + break;
  81 + }
  82 + }
  83 + decode.finish();
  84 + };
  85 +}
61 86  
62 87 QPDF::JSONReactor::JSONReactor(
63   - QPDF& pdf, std::string const& filename, bool must_be_complete) :
  88 + QPDF& pdf, std::shared_ptr<InputSource> is, bool must_be_complete) :
64 89 pdf(pdf),
65   - filename(filename),
  90 + is(is),
66 91 must_be_complete(must_be_complete),
67 92 errors(false),
68 93 parse_error(false),
... ... @@ -334,8 +359,6 @@ QPDF::JSONReactor::dictionaryItem(std::string const& key, JSON const& value)
334 359 replacement =
335 360 pdf.reserveStream(tos.getObjectID(), tos.getGeneration());
336 361 replaceObject(tos, replacement);
337   - replacement.replaceStreamData(
338   - "", "<<>>"_qpdf, "<<>>"_qpdf); // QXXXQ
339 362 }
340 363 } else {
341 364 // Ignore unknown keys for forward compatibility
... ... @@ -369,6 +392,7 @@ QPDF::JSONReactor::dictionaryItem(std::string const& key, JSON const& value)
369 392 throw std::logic_error("no object on stack in st_stream");
370 393 }
371 394 auto tos = object_stack.back();
  395 + auto uninitialized = QPDFObjectHandle();
372 396 if (!tos.isStream()) {
373 397 // QXXXQ QTC in update mode
374 398 error(value.getStart(), "this object is not a stream");
... ... @@ -388,10 +412,33 @@ QPDF::JSONReactor::dictionaryItem(std::string const& key, JSON const& value)
388 412 }
389 413 } else if (key == "data") {
390 414 this->saw_data = true;
391   - // QXXXQ
  415 + std::string v;
  416 + if (!value.getString(v)) {
  417 + error(value.getStart(), "\"stream.data\" must be a string");
  418 + } else {
  419 + // The range includes the quotes.
  420 + auto start = value.getStart() + 1;
  421 + auto end = value.getEnd() - 1;
  422 + if (end < start) {
  423 + throw std::logic_error("QPDF_json: JSON string length < 0");
  424 + }
  425 + tos.replaceStreamData(
  426 + provide_data(is, start, end), uninitialized, uninitialized);
  427 + }
392 428 } else if (key == "datafile") {
393 429 this->saw_datafile = true;
394   - // QXXXQ
  430 + std::string filename;
  431 + if (value.getString(filename)) {
  432 + tos.replaceStreamData(
  433 + QUtil::file_provider(filename),
  434 + uninitialized,
  435 + uninitialized);
  436 + } else {
  437 + error(
  438 + value.getStart(),
  439 + "\"stream.datafile\" must be a string containing a file "
  440 + "name");
  441 + }
395 442 } else {
396 443 // Ignore unknown keys for forward compatibility.
397 444 // QXXXQ QTC
... ... @@ -471,7 +518,8 @@ QPDF::JSONReactor::makeObject(JSON const& value)
471 518 // QXXXQ include object number in description
472 519 result.setObjectDescription(
473 520 &this->pdf,
474   - this->filename + " offset " + QUtil::uint_to_string(value.getStart()));
  521 + this->is->getName() + " offset " +
  522 + QUtil::uint_to_string(value.getStart()));
475 523 return result;
476 524 }
477 525  
... ... @@ -503,7 +551,7 @@ QPDF::updateFromJSON(std::shared_ptr<InputSource> is)
503 551 void
504 552 QPDF::importJSON(std::shared_ptr<InputSource> is, bool must_be_complete)
505 553 {
506   - JSONReactor reactor(*this, is->getName(), must_be_complete);
  554 + JSONReactor reactor(*this, is, must_be_complete);
507 555 try {
508 556 JSON::parse(*is, &reactor);
509 557 } catch (std::runtime_error& e) {
... ...