Commit e4e2e26d990d038b0d35e7466c8a24dbfafab7d2

Authored by Jay Berkenbilt
1 parent 1a4dcb4a

Properly handle pages with no contents (fixes #194)

Remove calls to assertPageObject(). All cases in the library that
called assertPageObject() work fine if you don't call
assertPageObject() because nothing assumes anything that was being
checked by that call. Removing the calls enables more files to be
successfully processed.
ChangeLog
  1 +2018-03-06 Jay Berkenbilt <ejb@ql.org>
  2 +
  3 + * Properly handle pages with no contents. Fixes #194.
  4 +
1 2018-03-05 Jay Berkenbilt <ejb@ql.org> 5 2018-03-05 Jay Berkenbilt <ejb@ql.org>
2 6
3 * Improve handling of loops while following cross reference 7 * Improve handling of loops while following cross reference
include/qpdf/QPDFObjectHandle.hh
@@ -830,6 +830,11 @@ class QPDFObjectHandle @@ -830,6 +830,11 @@ class QPDFObjectHandle
830 QPDF_DLL 830 QPDF_DLL
831 void assertNumber(); 831 void assertNumber();
832 832
  833 + // The isPageObject method checks the /Type key of the object.
  834 + // This is not completely reliable as there are some otherwise
  835 + // valid files whose /Type is wrong for page objects. qpdf is
  836 + // slightly more accepting but may still return false here when
  837 + // treating the object as a page would work. Use this sparingly.
833 QPDF_DLL 838 QPDF_DLL
834 bool isPageObject(); 839 bool isPageObject();
835 QPDF_DLL 840 QPDF_DLL
libqpdf/QPDFObjectHandle.cc
@@ -932,8 +932,6 @@ QPDFObjectHandle::getGeneration() const @@ -932,8 +932,6 @@ QPDFObjectHandle::getGeneration() const
932 std::map<std::string, QPDFObjectHandle> 932 std::map<std::string, QPDFObjectHandle>
933 QPDFObjectHandle::getPageImages() 933 QPDFObjectHandle::getPageImages()
934 { 934 {
935 - assertPageObject();  
936 -  
937 // Note: this code doesn't handle inherited resources. If this 935 // Note: this code doesn't handle inherited resources. If this
938 // page dictionary doesn't have a /Resources key or has one whose 936 // page dictionary doesn't have a /Resources key or has one whose
939 // value is null or an empty dictionary, you are supposed to walk 937 // value is null or an empty dictionary, you are supposed to walk
@@ -1081,7 +1079,6 @@ QPDFObjectHandle::addPageContents(QPDFObjectHandle new_contents, bool first) @@ -1081,7 +1079,6 @@ QPDFObjectHandle::addPageContents(QPDFObjectHandle new_contents, bool first)
1081 void 1079 void
1082 QPDFObjectHandle::rotatePage(int angle, bool relative) 1080 QPDFObjectHandle::rotatePage(int angle, bool relative)
1083 { 1081 {
1084 - assertPageObject();  
1085 if ((angle % 90) != 0) 1082 if ((angle % 90) != 0)
1086 { 1083 {
1087 throw std::runtime_error( 1084 throw std::runtime_error(
@@ -1137,7 +1134,6 @@ QPDFObjectHandle::rotatePage(int angle, bool relative) @@ -1137,7 +1134,6 @@ QPDFObjectHandle::rotatePage(int angle, bool relative)
1137 void 1134 void
1138 QPDFObjectHandle::coalesceContentStreams() 1135 QPDFObjectHandle::coalesceContentStreams()
1139 { 1136 {
1140 - assertPageObject();  
1141 QPDFObjectHandle contents = this->getKey("/Contents"); 1137 QPDFObjectHandle contents = this->getKey("/Contents");
1142 if (contents.isStream()) 1138 if (contents.isStream())
1143 { 1139 {
@@ -1218,7 +1214,6 @@ QPDFObjectHandle::parse(std::string const& object_str, @@ -1218,7 +1214,6 @@ QPDFObjectHandle::parse(std::string const& object_str,
1218 void 1214 void
1219 QPDFObjectHandle::pipePageContents(Pipeline* p) 1215 QPDFObjectHandle::pipePageContents(Pipeline* p)
1220 { 1216 {
1221 - assertPageObject();  
1222 std::string description = "page object " + 1217 std::string description = "page object " +
1223 QUtil::int_to_string(this->m->objid) + " " + 1218 QUtil::int_to_string(this->m->objid) + " " +
1224 QUtil::int_to_string(this->m->generation); 1219 QUtil::int_to_string(this->m->generation);
@@ -1256,7 +1251,6 @@ QPDFObjectHandle::pipeContentStreams( @@ -1256,7 +1251,6 @@ QPDFObjectHandle::pipeContentStreams(
1256 void 1251 void
1257 QPDFObjectHandle::parsePageContents(ParserCallbacks* callbacks) 1252 QPDFObjectHandle::parsePageContents(ParserCallbacks* callbacks)
1258 { 1253 {
1259 - assertPageObject();  
1260 std::string description = "page object " + 1254 std::string description = "page object " +
1261 QUtil::int_to_string(this->m->objid) + " " + 1255 QUtil::int_to_string(this->m->objid) + " " +
1262 QUtil::int_to_string(this->m->generation); 1256 QUtil::int_to_string(this->m->generation);
@@ -1267,7 +1261,6 @@ QPDFObjectHandle::parsePageContents(ParserCallbacks* callbacks) @@ -1267,7 +1261,6 @@ QPDFObjectHandle::parsePageContents(ParserCallbacks* callbacks)
1267 void 1261 void
1268 QPDFObjectHandle::filterPageContents(TokenFilter* filter, Pipeline* next) 1262 QPDFObjectHandle::filterPageContents(TokenFilter* filter, Pipeline* next)
1269 { 1263 {
1270 - assertPageObject();  
1271 std::string description = "token filter for page object " + 1264 std::string description = "token filter for page object " +
1272 QUtil::int_to_string(this->m->objid) + " " + 1265 QUtil::int_to_string(this->m->objid) + " " +
1273 QUtil::int_to_string(this->m->generation); 1266 QUtil::int_to_string(this->m->generation);
@@ -2222,8 +2215,29 @@ QPDFObjectHandle::assertNumber() @@ -2222,8 +2215,29 @@ QPDFObjectHandle::assertNumber()
2222 bool 2215 bool
2223 QPDFObjectHandle::isPageObject() 2216 QPDFObjectHandle::isPageObject()
2224 { 2217 {
2225 - // Some PDF files have /Type broken on pages.  
2226 - return (this->isDictionary() && this->hasKey("/Contents")); 2218 + // See comments in QPDFObjectHandle.hh.
  2219 + if (! this->isDictionary())
  2220 + {
  2221 + return false;
  2222 + }
  2223 + if (this->hasKey("/Type"))
  2224 + {
  2225 + QPDFObjectHandle type = this->getKey("/Type");
  2226 + if (type.isName() && (type.getName() == "/Page"))
  2227 + {
  2228 + return true;
  2229 + }
  2230 + // Files have been seen in the wild that have /Type (Page)
  2231 + if (type.isString() && (type.getStringValue() == "Page"))
  2232 + {
  2233 + return true;
  2234 + }
  2235 + }
  2236 + if (this->hasKey("/Contents"))
  2237 + {
  2238 + return true;
  2239 + }
  2240 + return false;
2227 } 2241 }
2228 2242
2229 bool 2243 bool
libqpdf/QPDF_pages.cc
@@ -191,7 +191,6 @@ QPDF::insertPage(QPDFObjectHandle newpage, int pos) @@ -191,7 +191,6 @@ QPDF::insertPage(QPDFObjectHandle newpage, int pos)
191 // pos = npages adds to the end. 191 // pos = npages adds to the end.
192 192
193 flattenPagesTree(); 193 flattenPagesTree();
194 - newpage.assertPageObject();  
195 194
196 if (! newpage.isIndirect()) 195 if (! newpage.isIndirect())
197 { 196 {
@@ -288,7 +287,6 @@ QPDF::addPage(QPDFObjectHandle newpage, bool first) @@ -288,7 +287,6 @@ QPDF::addPage(QPDFObjectHandle newpage, bool first)
288 int 287 int
289 QPDF::findPage(QPDFObjectHandle& page) 288 QPDF::findPage(QPDFObjectHandle& page)
290 { 289 {
291 - page.assertPageObject();  
292 return findPage(page.getObjGen()); 290 return findPage(page.getObjGen());
293 } 291 }
294 292
qpdf/qtest/qpdf.test
@@ -921,6 +921,34 @@ $td->runtest("check output", @@ -921,6 +921,34 @@ $td->runtest("check output",
921 921
922 show_ntests(); 922 show_ntests();
923 # ---------- 923 # ----------
  924 +$td->notify("--- Page with no contents ---");
  925 +$n_tests += 7;
  926 +
  927 +$td->runtest("check no contents",
  928 + {$td->COMMAND => "qpdf --check no-contents.pdf"},
  929 + {$td->FILE => "no-contents-check.out", $td->EXIT_STATUS => 0},
  930 + $td->NORMALIZE_NEWLINES);
  931 +
  932 +foreach my $arg ('--qdf', '--coalesce-contents', '')
  933 +{
  934 + $td->runtest("convert no contents ($arg)",
  935 + {$td->COMMAND =>
  936 + "qpdf $arg --static-id no-contents.pdf a.pdf"},
  937 + {$td->STRING => "", $td->EXIT_STATUS => 0});
  938 +
  939 + my $suf = $arg;
  940 + $suf =~ s/--//;
  941 + if ($suf eq '')
  942 + {
  943 + $suf = "none";
  944 + }
  945 + $td->runtest("check output",
  946 + {$td->FILE => "a.pdf"},
  947 + {$td->FILE => "no-contents-$suf.pdf"});
  948 +}
  949 +
  950 +show_ntests();
  951 +# ----------
924 $td->notify("--- Token filters ---"); 952 $td->notify("--- Token filters ---");
925 $n_tests += 2; 953 $n_tests += 2;
926 954
qpdf/qtest/qpdf/no-contents-check.out 0 → 100644
  1 +checking no-contents.pdf
  2 +PDF Version: 1.3
  3 +File is not encrypted
  4 +File is not linearized
  5 +No syntax or stream encoding errors found; the file may still contain
  6 +errors that qpdf cannot detect
qpdf/qtest/qpdf/no-contents-coalesce-contents.pdf 0 → 100644
  1 +%PDF-1.3
  2 +%¿÷¢þ
  3 +1 0 obj
  4 +<< /Pages 2 0 R /Type /Catalog >>
  5 +endobj
  6 +2 0 obj
  7 +<< /Count 1 /Kids [ 3 0 R ] /Type /Pages >>
  8 +endobj
  9 +3 0 obj
  10 +<< /Contents 4 0 R /MediaBox [ 0 0 720 720 ] /Parent 2 0 R /Resources << >> /Type /Page >>
  11 +endobj
  12 +4 0 obj
  13 +<< /Length 0 /Filter /FlateDecode >>
  14 +stream
  15 +endstream
  16 +endobj
  17 +xref
  18 +0 5
  19 +0000000000 65535 f
  20 +0000000015 00000 n
  21 +0000000064 00000 n
  22 +0000000123 00000 n
  23 +0000000229 00000 n
  24 +trailer << /Root 1 0 R /Size 5 /ID [<52bba3c78160d0c6e851b59110e5d076><31415926535897932384626433832795>] >>
  25 +startxref
  26 +298
  27 +%%EOF
qpdf/qtest/qpdf/no-contents-none.pdf 0 → 100644
  1 +%PDF-1.3
  2 +%¿÷¢þ
  3 +1 0 obj
  4 +<< /Pages 2 0 R /Type /Catalog >>
  5 +endobj
  6 +2 0 obj
  7 +<< /Count 1 /Kids [ 3 0 R ] /Type /Pages >>
  8 +endobj
  9 +3 0 obj
  10 +<< /MediaBox [ 0 0 720 720 ] /Parent 2 0 R /Resources << >> /Type /Page >>
  11 +endobj
  12 +xref
  13 +0 4
  14 +0000000000 65535 f
  15 +0000000015 00000 n
  16 +0000000064 00000 n
  17 +0000000123 00000 n
  18 +trailer << /Root 1 0 R /Size 4 /ID [<52bba3c78160d0c6e851b59110e5d076><31415926535897932384626433832795>] >>
  19 +startxref
  20 +213
  21 +%%EOF
qpdf/qtest/qpdf/no-contents-qdf.pdf 0 → 100644
  1 +%PDF-1.3
  2 +%¿÷¢þ
  3 +%QDF-1.0
  4 +
  5 +%% Original object ID: 1 0
  6 +1 0 obj
  7 +<<
  8 + /Pages 2 0 R
  9 + /Type /Catalog
  10 +>>
  11 +endobj
  12 +
  13 +%% Original object ID: 2 0
  14 +2 0 obj
  15 +<<
  16 + /Count 1
  17 + /Kids [
  18 + 3 0 R
  19 + ]
  20 + /Type /Pages
  21 +>>
  22 +endobj
  23 +
  24 +%% Page 1
  25 +%% Original object ID: 3 0
  26 +3 0 obj
  27 +<<
  28 + /MediaBox [
  29 + 0
  30 + 0
  31 + 720
  32 + 720
  33 + ]
  34 + /Parent 2 0 R
  35 + /Resources <<
  36 + >>
  37 + /Type /Page
  38 +>>
  39 +endobj
  40 +
  41 +xref
  42 +0 4
  43 +0000000000 65535 f
  44 +0000000052 00000 n
  45 +0000000133 00000 n
  46 +0000000242 00000 n
  47 +trailer <<
  48 + /Root 1 0 R
  49 + /Size 4
  50 + /ID [<52bba3c78160d0c6e851b59110e5d076><31415926535897932384626433832795>]
  51 +>>
  52 +startxref
  53 +361
  54 +%%EOF
qpdf/qtest/qpdf/no-contents.pdf 0 → 100644
  1 +%PDF-1.3
  2 +%¿÷¢þ
  3 +1 0 obj
  4 +<< /Pages 2 0 R /Type /Catalog >>
  5 +endobj
  6 +2 0 obj
  7 +<< /Count 1 /Kids [ 3 0 R ] /Type /Pages >>
  8 +endobj
  9 +3 0 obj
  10 +<< /MediaBox [ 0 0 720 720 ] /Parent 2 0 R /Resources << >> /Type /Page >>
  11 +endobj
  12 +xref
  13 +0 4
  14 +0000000000 65535 f
  15 +0000000015 00000 n
  16 +0000000064 00000 n
  17 +0000000123 00000 n
  18 +trailer << /Root 1 0 R /Size 4 /ID [<52bba3c78160d0c6e851b59110e5d076><52bba3c78160d0c6e851b59110e5d076>] >>
  19 +startxref
  20 +213
  21 +%%EOF