Commit a01359189b32c60c2d55b039f7aefd6c3ce0ebde

Authored by Jay Berkenbilt
1 parent 158156d5

Fix dangling references (fixes #240)

On certain operations, such as iterating through all objects and
adding new indirect objects, walk through the entire object structure
and explicitly resolve any indirect references to non-existent
objects. That prevents new objects from springing into existence and
causing the previously dangling references to point to them.
ChangeLog
  1 +2019-01-04 Jay Berkenbilt <ejb@ql.org>
  2 +
  3 + * Detect and recover from dangling references. If a PDF file
  4 + contained an indirect reference to a non-existent object (which is
  5 + valid), when adding a new object to the file, it was possible for
  6 + the new object to take the object ID of the dangling reference,
  7 + thereby causing the dangling reference to point to the new object.
  8 + This case is now prevented. Fixes #240.
  9 +
1 10 2019-01-03 Jay Berkenbilt <ejb@ql.org>
2 11  
3 12 * Fix behavior of form field value setting to handle the following
... ...
include/qpdf/QPDF.hh
... ... @@ -431,9 +431,21 @@ class QPDF
431 431 QPDF_DLL
432 432 void showXRefTable();
433 433  
  434 + // Detect all indirect references to objects that don't exist and
  435 + // resolve them by replacing them with null, which is how the PDF
  436 + // spec says to interpret such dangling references. This method is
  437 + // called automatically if you try to add any new objects, if you
  438 + // call getAllObjects, and before a file is written. The qpdf
  439 + // object caches whether it has run this to avoid running it
  440 + // multiple times. You can pass true to force it to run again if
  441 + // you have explicitly added new objects that may have additional
  442 + // dangling references.
  443 + QPDF_DLL
  444 + void fixDanglingReferences(bool force = false);
  445 +
434 446 // Return the approximate number of indirect objects. It is
435 447 // approximate because not all objects in the file are preserved
436   - // in all cases.
  448 + // in all cases, and gaps in object numbering are not preserved.
437 449 QPDF_DLL
438 450 size_t getObjectCount();
439 451  
... ... @@ -1199,6 +1211,7 @@ class QPDF
1199 1211 CopiedStreamDataProvider* copied_stream_data_provider;
1200 1212 std::set<QPDFObjGen> attachment_streams;
1201 1213 bool reconstructed_xref;
  1214 + bool fixed_dangling_refs;
1202 1215  
1203 1216 // Linearization data
1204 1217 qpdf_offset_t first_xref_item_offset; // actual value from file
... ...
libqpdf/QPDF.cc
... ... @@ -94,6 +94,7 @@ QPDF::Members::Members() :
94 94 pushed_inherited_attributes_to_pages(false),
95 95 copied_stream_data_provider(0),
96 96 reconstructed_xref(false),
  97 + fixed_dangling_refs(false),
97 98 first_xref_item_offset(0),
98 99 uncompressed_after_compressed(false)
99 100 {
... ... @@ -1218,33 +1219,129 @@ QPDF::showXRefTable()
1218 1219 }
1219 1220 }
1220 1221  
  1222 +void
  1223 +QPDF::fixDanglingReferences(bool force)
  1224 +{
  1225 + if (this->m->fixed_dangling_refs && (! force))
  1226 + {
  1227 + return;
  1228 + }
  1229 + this->m->fixed_dangling_refs = true;
  1230 +
  1231 + // Create a set of all known indirect objects including those
  1232 + // we've previously resolved and those that we have created.
  1233 + std::set<QPDFObjGen> to_process;
  1234 + for (std::map<QPDFObjGen, ObjCache>::iterator iter =
  1235 + this->m->obj_cache.begin();
  1236 + iter != this->m->obj_cache.end(); ++iter)
  1237 + {
  1238 + to_process.insert((*iter).first);
  1239 + }
  1240 + for (std::map<QPDFObjGen, QPDFXRefEntry>::iterator iter =
  1241 + this->m->xref_table.begin();
  1242 + iter != this->m->xref_table.end(); ++iter)
  1243 + {
  1244 + to_process.insert((*iter).first);
  1245 + }
  1246 +
  1247 + // For each non-scalar item to process, put it in the queue.
  1248 + std::list<QPDFObjectHandle> queue;
  1249 + queue.push_back(this->m->trailer);
  1250 + for (std::set<QPDFObjGen>::iterator iter = to_process.begin();
  1251 + iter != to_process.end(); ++iter)
  1252 + {
  1253 + QPDFObjectHandle obj = QPDFObjectHandle::Factory::newIndirect(
  1254 + this, (*iter).getObj(), (*iter).getGen());
  1255 + if (obj.isDictionary() || obj.isArray())
  1256 + {
  1257 + queue.push_back(obj);
  1258 + }
  1259 + else if (obj.isStream())
  1260 + {
  1261 + queue.push_back(obj.getDict());
  1262 + }
  1263 + }
  1264 +
  1265 + // Process the queue by recursively resolving all object
  1266 + // references. We don't need to do loop detection because we don't
  1267 + // traverse known indirect objects when processing the queue.
  1268 + while (! queue.empty())
  1269 + {
  1270 + QPDFObjectHandle obj = queue.front();
  1271 + queue.pop_front();
  1272 + std::list<QPDFObjectHandle> to_check;
  1273 + if (obj.isDictionary())
  1274 + {
  1275 + std::map<std::string, QPDFObjectHandle> members =
  1276 + obj.getDictAsMap();
  1277 + for (std::map<std::string, QPDFObjectHandle>::iterator iter =
  1278 + members.begin();
  1279 + iter != members.end(); ++iter)
  1280 + {
  1281 + to_check.push_back((*iter).second);
  1282 + }
  1283 + }
  1284 + else if (obj.isArray())
  1285 + {
  1286 + std::vector<QPDFObjectHandle> elements = obj.getArrayAsVector();
  1287 + for (std::vector<QPDFObjectHandle>::iterator iter =
  1288 + elements.begin();
  1289 + iter != elements.end(); ++iter)
  1290 + {
  1291 + to_check.push_back(*iter);
  1292 + }
  1293 + }
  1294 + for (std::list<QPDFObjectHandle>::iterator iter = to_check.begin();
  1295 + iter != to_check.end(); ++iter)
  1296 + {
  1297 + QPDFObjectHandle sub = *iter;
  1298 + if (sub.isIndirect())
  1299 + {
  1300 + if (sub.getOwningQPDF() == this)
  1301 + {
  1302 + QPDFObjGen og(sub.getObjGen());
  1303 + if (this->m->obj_cache.count(og) == 0)
  1304 + {
  1305 + QTC::TC("qpdf", "QPDF detected dangling ref");
  1306 + queue.push_back(sub);
  1307 + }
  1308 + }
  1309 + }
  1310 + else
  1311 + {
  1312 + queue.push_back(sub);
  1313 + }
  1314 + }
  1315 +
  1316 + }
  1317 +}
  1318 +
1221 1319 size_t
1222 1320 QPDF::getObjectCount()
1223 1321 {
1224 1322 // This method returns the next available indirect object number.
1225   - // makeIndirectObject uses it for this purpose.
1226   - QPDFObjGen o1(0, 0);
  1323 + // makeIndirectObject uses it for this purpose. After
  1324 + // fixDanglingReferences is called, all objects in the xref table
  1325 + // will also be in obj_cache.
  1326 + fixDanglingReferences();
  1327 + QPDFObjGen og(0, 0);
1227 1328 if (! this->m->obj_cache.empty())
1228 1329 {
1229   - o1 = (*(this->m->obj_cache.rbegin())).first;
1230   - }
1231   - QPDFObjGen o2(0, 0);
1232   - if (! this->m->xref_table.empty())
1233   - {
1234   - o2 = (*(this->m->xref_table.rbegin())).first;
  1330 + og = (*(this->m->obj_cache.rbegin())).first;
1235 1331 }
1236   - QTC::TC("qpdf", "QPDF indirect last obj from xref",
1237   - (o2.getObj() > o1.getObj()) ? 1 : 0);
1238   - return std::max(o1.getObj(), o2.getObj());
  1332 + return og.getObj();
1239 1333 }
1240 1334  
1241 1335 std::vector<QPDFObjectHandle>
1242 1336 QPDF::getAllObjects()
1243 1337 {
  1338 + // After fixDanglingReferences is called, all objects are in the
  1339 + // object cache.
  1340 + fixDanglingReferences(true);
1244 1341 std::vector<QPDFObjectHandle> result;
1245   - for (std::map<QPDFObjGen, QPDFXRefEntry>::iterator iter =
1246   - this->m->xref_table.begin();
1247   - iter != this->m->xref_table.end(); ++iter)
  1342 + for (std::map<QPDFObjGen, ObjCache>::iterator iter =
  1343 + this->m->obj_cache.begin();
  1344 + iter != this->m->obj_cache.end(); ++iter)
1248 1345 {
1249 1346  
1250 1347 QPDFObjGen const& og = (*iter).first;
... ... @@ -1752,7 +1849,6 @@ QPDF::resolve(int objid, int generation)
1752 1849 }
1753 1850 ResolveRecorder rr(this, og);
1754 1851  
1755   - // PDF spec says unknown objects resolve to the null object.
1756 1852 if ((! this->m->obj_cache.count(og)) && this->m->xref_table.count(og))
1757 1853 {
1758 1854 QPDFXRefEntry const& entry = this->m->xref_table[og];
... ... @@ -1800,6 +1896,7 @@ QPDF::resolve(int objid, int generation)
1800 1896 }
1801 1897 if (this->m->obj_cache.count(og) == 0)
1802 1898 {
  1899 + // PDF spec says unknown objects resolve to the null object.
1803 1900 QTC::TC("qpdf", "QPDF resolve failure to null");
1804 1901 QPDFObjectHandle oh = QPDFObjectHandle::newNull();
1805 1902 this->m->obj_cache[og] =
... ...
libqpdf/QPDFWriter.cc
... ... @@ -2244,6 +2244,7 @@ QPDFWriter::prepareFileForWrite()
2244 2244 // includes stream lengths, stream filtering parameters, and
2245 2245 // document extension level information.
2246 2246  
  2247 + this->m->pdf.fixDanglingReferences(true);
2247 2248 std::list<QPDFObjectHandle> queue;
2248 2249 queue.push_back(getTrimmedTrailer());
2249 2250 std::set<int> visited;
... ...
qpdf/qpdf.testcov
... ... @@ -82,7 +82,6 @@ QPDFObjectHandle clone dictionary 0
82 82 QPDFObjectHandle makeDirect loop 0
83 83 QPDFObjectHandle ERR clone stream 0
84 84 QPDFTokenizer allow pound anywhere in name 0
85   -QPDF indirect last obj from xref 1
86 85 QPDF default for xref stream field 0 0
87 86 QPDF prev key in xref stream dictionary 0
88 87 QPDF prev key in trailer dictionary 0
... ... @@ -402,3 +401,4 @@ QPDFFormFieldObjectHelper list not found 0
402 401 QPDFFormFieldObjectHelper list found 0
403 402 QPDFFormFieldObjectHelper list first too low 0
404 403 QPDFFormFieldObjectHelper list last too high 0
  404 +QPDF detected dangling ref 0
... ...
qpdf/qtest/qpdf.test
... ... @@ -175,6 +175,22 @@ $td->runtest("\@file exists and file doesn't",
175 175  
176 176 show_ntests();
177 177 # ----------
  178 +$td->notify("--- Dangling Refs ---");
  179 +my @dangling = (qw(minimal dangling-refs));
  180 +$n_tests += 2 * scalar(@dangling);
  181 +
  182 +foreach my $f (@dangling)
  183 +{
  184 + $td->runtest("dangling refs: $f",
  185 + {$td->COMMAND => "test_driver 53 $f.pdf"},
  186 + {$td->FILE => "$f-dangling.out", $td->EXIT_STATUS => 0},
  187 + $td->NORMALIZE_NEWLINES);
  188 + $td->runtest("check output",
  189 + {$td->FILE => "a.pdf"},
  190 + {$td->FILE => "$f-dangling-out.pdf"});
  191 +}
  192 +show_ntests();
  193 +# ----------
178 194 $td->notify("--- Form Tests ---");
179 195  
180 196 my @form_tests = (
... ...
qpdf/qtest/qpdf/dangling-refs-dangling-out.pdf 0 → 100644
No preview for this file type
qpdf/qtest/qpdf/dangling-refs-dangling.out 0 → 100644
  1 +all objects
  2 +1 0 R
  3 +2 0 R
  4 +3 0 R
  5 +4 0 R
  6 +5 0 R
  7 +6 0 R
  8 +7 0 R
  9 +8 0 R
  10 +9 0 R
  11 +10 0 R
  12 +11 0 R
  13 +test 53 done
... ...
qpdf/qtest/qpdf/dangling-refs.pdf 0 → 100644
  1 +%PDF-1.3
  2 +%¿÷¢þ
  3 +%QDF-1.0
  4 +
  5 +1 0 obj
  6 +<<
  7 + /Pages 2 0 R
  8 + /Type /Catalog
  9 + /Dangling 8 0 R
  10 + /AlsoDangling [
  11 + 9 0 R
  12 + <<
  13 + /yes 2 0 R
  14 + /no 10 0 R
  15 + /nope 8 0 R
  16 + >>
  17 + ]
  18 +>>
  19 +endobj
  20 +
  21 +2 0 obj
  22 +<<
  23 + /Count 1
  24 + /Kids [
  25 + 3 0 R
  26 + ]
  27 + /Type /Pages
  28 +>>
  29 +endobj
  30 +
  31 +%% Page 1
  32 +3 0 obj
  33 +<<
  34 + /Contents 4 0 R
  35 + /MediaBox [
  36 + 0
  37 + 0
  38 + 612
  39 + 792
  40 + ]
  41 + /Parent 2 0 R
  42 + /Resources <<
  43 + /Font <<
  44 + /F1 6 0 R
  45 + >>
  46 + /ProcSet 7 0 R
  47 + >>
  48 + /Type /Page
  49 +>>
  50 +endobj
  51 +
  52 +%% Contents for page 1
  53 +4 0 obj
  54 +<<
  55 + /Length 5 0 R
  56 +>>
  57 +stream
  58 +BT
  59 + /F1 24 Tf
  60 + 72 720 Td
  61 + (Potato) Tj
  62 +ET
  63 +endstream
  64 +endobj
  65 +
  66 +5 0 obj
  67 +44
  68 +endobj
  69 +
  70 +6 0 obj
  71 +<<
  72 + /BaseFont /Helvetica
  73 + /Encoding /WinAnsiEncoding
  74 + /Name /F1
  75 + /Subtype /Type1
  76 + /Type /Font
  77 +>>
  78 +endobj
  79 +
  80 +7 0 obj
  81 +[
  82 + /PDF
  83 + /Text
  84 +]
  85 +endobj
  86 +
  87 +xref
  88 +0 8
  89 +0000000000 65535 f
  90 +0000000025 00000 n
  91 +0000000195 00000 n
  92 +0000000277 00000 n
  93 +0000000492 00000 n
  94 +0000000591 00000 n
  95 +0000000610 00000 n
  96 +0000000728 00000 n
  97 +trailer <<
  98 + /Root 1 0 R
  99 + /Size 8
  100 + /ID [<7141a6cf32de469328cf0f51982b5f89><7141a6cf32de469328cf0f51982b5f89>]
  101 +>>
  102 +startxref
  103 +763
  104 +%%EOF
... ...
qpdf/qtest/qpdf/issue-101.out
... ... @@ -122,6 +122,32 @@ WARNING: issue-101.pdf (object 11 0, offset 1357): unknown token while reading o
122 122 WARNING: issue-101.pdf (object 11 0, offset 1359): unknown token while reading object; treating as string
123 123 WARNING: issue-101.pdf (object 11 0, offset 1368): unexpected )
124 124 WARNING: issue-101.pdf (object 11 0, offset 1373): expected endobj
  125 +WARNING: issue-101.pdf (object 2 0, offset 244): unknown token while reading object; treating as string
  126 +WARNING: issue-101.pdf (object 7 0, offset 3855): unknown token while reading object; treating as string
  127 +WARNING: issue-101.pdf (object 7 0, offset 3863): treating unexpected brace token as null
  128 +WARNING: issue-101.pdf (object 7 0, offset 3864): unknown token while reading object; treating as string
  129 +WARNING: issue-101.pdf (object 7 0, offset 3866): unknown token while reading object; treating as string
  130 +WARNING: issue-101.pdf (object 7 0, offset 3873): unknown token while reading object; treating as string
  131 +WARNING: issue-101.pdf (object 7 0, offset 3879): unknown token while reading object; treating as string
  132 +WARNING: issue-101.pdf (object 7 0, offset 3888): unknown token while reading object; treating as string
  133 +WARNING: issue-101.pdf (object 7 0, offset 3901): unknown token while reading object; treating as string
  134 +WARNING: issue-101.pdf (object 7 0, offset 3905): unknown token while reading object; treating as string
  135 +WARNING: issue-101.pdf (object 7 0, offset 3913): unknown token while reading object; treating as string
  136 +WARNING: issue-101.pdf (object 7 0, offset 3847): expected dictionary key but found non-name object; inserting key /QPDFFake1
  137 +WARNING: issue-101.pdf (object 7 0, offset 3847): expected dictionary key but found non-name object; inserting key /QPDFFake2
  138 +WARNING: issue-101.pdf (object 7 0, offset 3847): expected dictionary key but found non-name object; inserting key /QPDFFake3
  139 +WARNING: issue-101.pdf (object 7 0, offset 3847): expected dictionary key but found non-name object; inserting key /QPDFFake4
  140 +WARNING: issue-101.pdf (object 7 0, offset 3847): expected dictionary key but found non-name object; inserting key /QPDFFake5
  141 +WARNING: issue-101.pdf (object 7 0, offset 3847): expected dictionary key but found non-name object; inserting key /QPDFFake6
  142 +WARNING: issue-101.pdf (object 7 0, offset 3847): expected dictionary key but found non-name object; inserting key /QPDFFake7
  143 +WARNING: issue-101.pdf (object 7 0, offset 3847): expected dictionary key but found non-name object; inserting key /QPDFFake8
  144 +WARNING: issue-101.pdf (object 7 0, offset 3847): expected dictionary key but found non-name object; inserting key /QPDFFake9
  145 +WARNING: issue-101.pdf (object 7 0, offset 3847): expected dictionary key but found non-name object; inserting key /QPDFFake10
  146 +WARNING: issue-101.pdf (object 7 0, offset 3844): stream dictionary lacks /Length key
  147 +WARNING: issue-101.pdf (object 7 0, offset 3962): attempting to recover stream length
  148 +WARNING: issue-101.pdf (object 7 0, offset 3962): recovered stream length: 12
125 149 WARNING: issue-101.pdf (object 8 0, offset 4067): invalid character ()) in hexstring
126 150 WARNING: issue-101.pdf (object 8 0, offset 4069): expected endobj
  151 +WARNING: issue-101.pdf (object 9 0, offset 2832): unknown token while reading object; treating as string
  152 +WARNING: issue-101.pdf (object 9 0, offset 2834): expected endobj
127 153 qpdf: operation succeeded with warnings; resulting file may have some problems
... ...
qpdf/qtest/qpdf/issue-117.out
... ... @@ -5,4 +5,12 @@ WARNING: issue-117.pdf (offset 66): loop detected resolving object 2 0
5 5 WARNING: issue-117.pdf (object 2 0, offset 22): /Length key in stream dictionary is not an integer
6 6 WARNING: issue-117.pdf (object 2 0, offset 67): attempting to recover stream length
7 7 WARNING: issue-117.pdf (object 2 0, offset 67): recovered stream length: 91
  8 +WARNING: issue-117.pdf (object 5 0, offset 1559): expected endstream
  9 +WARNING: issue-117.pdf (object 5 0, offset 349): attempting to recover stream length
  10 +WARNING: issue-117.pdf (object 5 0, offset 349): recovered stream length: 762
  11 +WARNING: issue-117.pdf (object 5 0, offset 1121): expected endobj
  12 +WARNING: issue-117.pdf (object 7 0, offset 1791): unknown token while reading object; treating as string
  13 +WARNING: issue-117.pdf (object 7 0, offset 1267): /Length key in stream dictionary is not an integer
  14 +WARNING: issue-117.pdf (object 7 0, offset 1418): attempting to recover stream length
  15 +WARNING: issue-117.pdf (object 7 0, offset 1418): recovered stream length: 347
8 16 attempt to make a stream into a direct object
... ...
qpdf/qtest/qpdf/issue-120.out
1 1 WARNING: issue-120.pdf (offset 85): loop detected resolving object 3 0
2 2 WARNING: issue-120.pdf (object 6 0, offset 85): supposed object stream 3 is not a stream
  3 +WARNING: issue-120.pdf: file is damaged
  4 +WARNING: issue-120.pdf (object 8 10, offset 26880): expected n n obj
  5 +WARNING: issue-120.pdf: Attempting to reconstruct cross-reference table
  6 +WARNING: issue-120.pdf: object 8 10 not found in file after regenerating cross reference table
3 7 qpdf: operation succeeded with warnings; resulting file may have some problems
... ...
qpdf/qtest/qpdf/issue-143.out
... ... @@ -14,4 +14,7 @@ WARNING: issue-143.pdf (object 1 0, offset 21): stream dictionary lacks /Length
14 14 WARNING: issue-143.pdf (object 1 0, offset 84): attempting to recover stream length
15 15 WARNING: issue-143.pdf (object 1 0, offset 84): recovered stream length: 606
16 16 WARNING: issue-143.pdf object stream 1 (object 2 0, offset 33): expected dictionary key but found non-name object; inserting key /QPDFFake1
  17 +WARNING: issue-143.pdf (object 2 0, offset 84): supposed object stream 12336 is not a stream
  18 +WARNING: issue-143.pdf (object 2 0, offset 84): supposed object stream 12336 is not a stream
  19 +WARNING: issue-143.pdf (object 2 0, offset 84): supposed object stream 12336 is not a stream
17 20 qpdf: operation succeeded with warnings; resulting file may have some problems
... ...
qpdf/qtest/qpdf/issue-51.out
... ... @@ -8,3 +8,8 @@ WARNING: issue-51.pdf (object 2 0, offset 71): attempting to recover stream leng
8 8 WARNING: issue-51.pdf (object 2 0, offset 71): unable to recover stream data; treating stream as empty
9 9 WARNING: issue-51.pdf (object 2 0, offset 977): expected endobj
10 10 WARNING: issue-51.pdf (object 2 0, offset 977): EOF after endobj
  11 +WARNING: issue-51.pdf (object 3 0): object has offset 0
  12 +WARNING: issue-51.pdf (object 4 0): object has offset 0
  13 +WARNING: issue-51.pdf (object 5 0): object has offset 0
  14 +WARNING: issue-51.pdf (object 6 0): object has offset 0
  15 +WARNING: issue-51.pdf (object 8 0): object has offset 0
... ...
qpdf/qtest/qpdf/minimal-dangling-out.pdf 0 → 100644
No preview for this file type
qpdf/qtest/qpdf/minimal-dangling.out 0 → 100644
  1 +all objects
  2 +1 0 R
  3 +2 0 R
  4 +3 0 R
  5 +4 0 R
  6 +5 0 R
  7 +6 0 R
  8 +7 0 R
  9 +test 53 done
... ...
qpdf/test_driver.cc
... ... @@ -1846,6 +1846,25 @@ void runtest(int n, char const* filename1, char const* arg2)
1846 1846 QPDFWriter w(pdf, "a.pdf");
1847 1847 w.write();
1848 1848 }
  1849 + else if (n == 53)
  1850 + {
  1851 + // Test get all objects and dangling ref handling
  1852 + QPDFObjectHandle root = pdf.getRoot();
  1853 + root.replaceKey(
  1854 + "/Q1",
  1855 + pdf.makeIndirectObject(QPDFObjectHandle::newString("potato")));
  1856 + std::cout << "all objects" << std::endl;
  1857 + std::vector<QPDFObjectHandle> all = pdf.getAllObjects();
  1858 + for (std::vector<QPDFObjectHandle>::iterator iter = all.begin();
  1859 + iter != all.end(); ++iter)
  1860 + {
  1861 + std::cout << (*iter).unparse() << std::endl;
  1862 + }
  1863 +
  1864 + QPDFWriter w(pdf, "a.pdf");
  1865 + w.setStaticID(true);
  1866 + w.write();
  1867 + }
1849 1868 else
1850 1869 {
1851 1870 throw std::runtime_error(std::string("invalid test ") +
... ...