Commit a85007cb0d9bb8af8f0a32bda3ace19aaff97816

Authored by Jay Berkenbilt
1 parent a1d5a3e9

Handle more broken files

Tolerate a space rather than a newline after the xref keyword, and a
missing /ID in the trailer of an encrypted file.  This enables qpdf to
handle some files that xpdf can handle.  Adobe Reader can't necessarily
handle them.
ChangeLog
  1 +2013-06-15 Jay Berkenbilt <ejb@ql.org>
  2 +
  3 + * Handle some additional broken files with missing /ID in trailer
  4 + for encrypted files and with space rather than newline after xref.
  5 +
1 2013-06-14 Jay Berkenbilt <ejb@ql.org> 6 2013-06-14 Jay Berkenbilt <ejb@ql.org>
2 7
3 * Detect and correct /Outlines dictionary being a direct object 8 * Detect and correct /Outlines dictionary being a direct object
libqpdf/QPDF.cc
@@ -430,11 +430,22 @@ QPDF::read_xref(qpdf_offset_t xref_offset) @@ -430,11 +430,22 @@ QPDF::read_xref(qpdf_offset_t xref_offset)
430 std::map<int, int> free_table; 430 std::map<int, int> free_table;
431 while (xref_offset) 431 while (xref_offset)
432 { 432 {
  433 + char buf[7];
  434 + memset(buf, 0, sizeof(buf));
433 this->file->seek(xref_offset, SEEK_SET); 435 this->file->seek(xref_offset, SEEK_SET);
434 - std::string line = this->file->readLine(50);  
435 - if (line == "xref") 436 + this->file->read(buf, sizeof(buf) - 1);
  437 + // The PDF spec says xref must be followed by a line
  438 + // terminator, but files exist in the wild where it is
  439 + // terminated by arbitrary whitespace.
  440 + PCRE xref_re("^xref\\s+");
  441 + PCRE::Match m = xref_re.match(buf);
  442 + if (m)
436 { 443 {
437 - xref_offset = read_xrefTable(this->file->tell()); 444 + QTC::TC("qpdf", "QPDF xref space",
  445 + ((buf[4] == '\n') ? 0 :
  446 + (buf[4] == '\r') ? 1 :
  447 + (buf[4] == ' ') ? 2 : 9999));
  448 + xref_offset = read_xrefTable(xref_offset + m.getMatch(0).length());
438 } 449 }
439 else 450 else
440 { 451 {
libqpdf/QPDF_encryption.cc
@@ -791,17 +791,24 @@ QPDF::initializeEncryption() @@ -791,17 +791,24 @@ QPDF::initializeEncryption()
791 // encryption dictionary. 791 // encryption dictionary.
792 this->encrypted = true; 792 this->encrypted = true;
793 793
  794 + std::string id1;
794 QPDFObjectHandle id_obj = this->trailer.getKey("/ID"); 795 QPDFObjectHandle id_obj = this->trailer.getKey("/ID");
795 - if (! (id_obj.isArray() &&  
796 - (id_obj.getArrayNItems() == 2) &&  
797 - id_obj.getArrayItem(0).isString())) 796 + if ((id_obj.isArray() &&
  797 + (id_obj.getArrayNItems() == 2) &&
  798 + id_obj.getArrayItem(0).isString()))
798 { 799 {
799 - throw QPDFExc(qpdf_e_damaged_pdf, this->file->getName(),  
800 - "trailer", this->file->getLastOffset(),  
801 - "invalid /ID in trailer dictionary"); 800 + id1 = id_obj.getArrayItem(0).getStringValue();
  801 + }
  802 + else
  803 + {
  804 + // Treating a missing ID as the empty string enables qpdf to
  805 + // decrypt some invalid encrypted files with no /ID that
  806 + // poppler can read but Adobe Reader can't.
  807 + warn(QPDFExc(qpdf_e_damaged_pdf, this->file->getName(),
  808 + "trailer", this->file->getLastOffset(),
  809 + "invalid /ID in trailer dictionary"));
802 } 810 }
803 811
804 - std::string id1 = id_obj.getArrayItem(0).getStringValue();  
805 QPDFObjectHandle encryption_dict = this->trailer.getKey("/Encrypt"); 812 QPDFObjectHandle encryption_dict = this->trailer.getKey("/Encrypt");
806 if (! encryption_dict.isDictionary()) 813 if (! encryption_dict.isDictionary())
807 { 814 {
qpdf/qpdf.testcov
@@ -264,3 +264,4 @@ QPDFObjectHandle inline image token 0 @@ -264,3 +264,4 @@ QPDFObjectHandle inline image token 0
264 QPDF not caching overridden objstm object 0 264 QPDF not caching overridden objstm object 0
265 QPDFWriter original obj non-zero gen 0 265 QPDFWriter original obj non-zero gen 0
266 QPDF_optimization indirect outlines 0 266 QPDF_optimization indirect outlines 0
  267 +QPDF xref space 2
qpdf/qtest/qpdf.test
@@ -199,7 +199,7 @@ $td->runtest("remove page we don't have", @@ -199,7 +199,7 @@ $td->runtest("remove page we don't have",
199 show_ntests(); 199 show_ntests();
200 # ---------- 200 # ----------
201 $td->notify("--- Miscellaneous Tests ---"); 201 $td->notify("--- Miscellaneous Tests ---");
202 -$n_tests += 64; 202 +$n_tests += 65;
203 203
204 $td->runtest("qpdf version", 204 $td->runtest("qpdf version",
205 {$td->COMMAND => "qpdf --version"}, 205 {$td->COMMAND => "qpdf --version"},
@@ -509,6 +509,14 @@ $td->runtest("check file", @@ -509,6 +509,14 @@ $td->runtest("check file",
509 {$td->FILE => "a.pdf"}, 509 {$td->FILE => "a.pdf"},
510 {$td->FILE => "gen1.qdf"}); 510 {$td->FILE => "gen1.qdf"});
511 511
  512 +# This file, from a user, is missing /ID in its trailer even though it
  513 +# is encrypted and also has a space instead of a newline after its
  514 +# xref keyword. xpdf can open it, but Adobe reader can't.
  515 +$td->runtest("check broken file",
  516 + {$td->COMMAND => "qpdf --check invalid-id-xref.pdf"},
  517 + {$td->FILE => "invalid-id-xref.out", $td->EXIT_STATUS => 3},
  518 + $td->NORMALIZE_NEWLINES);
  519 +
512 show_ntests(); 520 show_ntests();
513 # ---------- 521 # ----------
514 $td->notify("--- Numeric range parsing tests ---"); 522 $td->notify("--- Numeric range parsing tests ---");
qpdf/qtest/qpdf/invalid-id-xref.out 0 → 100644
  1 +WARNING: invalid-id-xref.pdf (trailer, file position 2493795): invalid /ID in trailer dictionary
  2 +checking invalid-id-xref.pdf
  3 +PDF Version: 1.1
  4 +R = 3
  5 +P = -1804
  6 +User password =
  7 +extract for accessibility: not allowed
  8 +extract for any purpose: allowed
  9 +print low resolution: allowed
  10 +print high resolution: allowed
  11 +modify document assembly: not allowed
  12 +modify forms: not allowed
  13 +modify annotations: allowed
  14 +modify other: not allowed
  15 +modify anything: not allowed
  16 +File is not linearized
qpdf/qtest/qpdf/invalid-id-xref.pdf 0 → 100644
No preview for this file type