Commit 8971443e4680fc1c0babe56da58cc9070a9dae2e
1 parent
ec48820c
QPDF::addPage*: handle duplicate pages more robustly
Showing
8 changed files
with
64 additions
and
42 deletions
ChangeLog
| 1 | +2021-04-05 Jay Berkenbilt <ejb@ql.org> | |
| 2 | + | |
| 3 | + * When adding a page, if the page already exists, make a shallow | |
| 4 | + copy of the page instead of throwing an exception. This makes the | |
| 5 | + behavior of adding a page from the library consistent with what | |
| 6 | + the CLI does and also with what the library does if it starts with | |
| 7 | + a file that already has a duplicated page. Note that this means | |
| 8 | + that, in some cases, the page you pass to addPage or addPageAt | |
| 9 | + (either in QPDF or QPDFPageDocumentHelper) will not be the same | |
| 10 | + object that actually gets added. (This has actually always been | |
| 11 | + the case.) That means that, if you are going to do subsequent | |
| 12 | + modification on the page, you should retrieve it again. | |
| 13 | + | |
| 1 | 14 | 2021-03-11 Jay Berkenbilt <ejb@ql.org> |
| 2 | 15 | |
| 3 | 16 | * 10.3.1: release | ... | ... |
TODO
| 1 | 1 | Document-level work |
| 2 | 2 | =================== |
| 3 | 3 | |
| 4 | -* Implement the tree helper described in the ABI section and provide | |
| 5 | - the getPagesTree method so QPDF::getAllPages can be deprecated in | |
| 6 | - 10.4 and people have time to transition before qpdf 11. Maybe I | |
| 7 | - should have a qpdf 11 branch so I can make sure I have the | |
| 8 | - interfaces the way I want them before releasing 10.4. | |
| 9 | - | |
| 10 | 4 | * QPDFPageCopier -- object for moving pages around within files or |
| 11 | 5 | between files and performing various transformations |
| 12 | 6 | |
| ... | ... | @@ -188,25 +182,6 @@ Comments appear in the code prefixed by "ABI" |
| 188 | 182 | before copying, though maybe we don't because it could cause |
| 189 | 183 | multiple copies to be made...usually it's better to handle that |
| 190 | 184 | explicitly. |
| 191 | -* (Also API) Create a helper class for tree structures like the pages | |
| 192 | - tree that have /Kids and /Count. It would be great to be able to | |
| 193 | - eliminate flattenPagesTree and all the page object to page caching, | |
| 194 | - and to get rid of getAllPages() that returns a reference to an | |
| 195 | - internal data structure. The tree helper object can have | |
| 196 | - bidirectional iterator and const_iterator, implement at and | |
| 197 | - operator[], and have reasonable insertion and deletion semantics. | |
| 198 | - Then a new getAllPages, perhaps getPagesTree, can return that and | |
| 199 | - make it easy to change existing code even if it assumes the result | |
| 200 | - of getAllPages() is mutating automatically as the pages tree is | |
| 201 | - manipulated. This will free us up to finally get rid of the | |
| 202 | - troublesome pages cache and its exposure by getAllPages. See also | |
| 203 | - note below with "balance the pages tree". Maybe we can get rid of | |
| 204 | - pageobj_to_pages_pos as well or convert it into something that is | |
| 205 | - merely advisory and not so heavily trusted. We should either make | |
| 206 | - the code not care if the same object appears more than once in the | |
| 207 | - pages tree or detect it and shallow copy, though the latter would | |
| 208 | - cause a significant performance penalty when manipulating pages if | |
| 209 | - we get rid of the cache. | |
| 210 | 185 | |
| 211 | 186 | Page splitting/merging |
| 212 | 187 | ====================== | ... | ... |
include/qpdf/QPDFPageDocumentHelper.hh
| ... | ... | @@ -73,12 +73,21 @@ class QPDFPageDocumentHelper: public QPDFDocumentHelper |
| 73 | 73 | // indirect. If it is an indirect object from another QPDF, this |
| 74 | 74 | // method will call pushInheritedAttributesToPage on the other |
| 75 | 75 | // file and then copy the page to this QPDF using the same |
| 76 | - // underlying code as copyForeignObject. Note that you can call | |
| 77 | - // copyForeignObject directly to copy a page from a different | |
| 78 | - // file, but the resulting object will not be a page in the new | |
| 79 | - // file. You could do this, for example, to convert a page into a | |
| 80 | - // form XObject, though for that, you're better off using | |
| 81 | - // QPDFPageObjectHelper::getFormXObjectForPage. | |
| 76 | + // underlying code as copyForeignObject. At this stage, if the | |
| 77 | + // indirect object is already in the pages tree, a shallow copy is | |
| 78 | + // made to avoid adding the same page more than once. In version | |
| 79 | + // 10.3.1 and earlier, adding a page that already existed would | |
| 80 | + // throw an exception and could cause qpdf to crash on subsequent | |
| 81 | + // page insertions in some cases. Note that this means that, in | |
| 82 | + // some cases, the page actually added won't be exactly the same | |
| 83 | + // object as the one passed in. If you want to do subsequent | |
| 84 | + // modification on the page, you should retrieve it again. | |
| 85 | + // | |
| 86 | + // Note that you can call copyForeignObject directly to copy a | |
| 87 | + // page from a different file, but the resulting object will not | |
| 88 | + // be a page in the new file. You could do this, for example, to | |
| 89 | + // convert a page into a form XObject, though for that, you're | |
| 90 | + // better off using QPDFPageObjectHelper::getFormXObjectForPage. | |
| 82 | 91 | // |
| 83 | 92 | // This method does not have any specific awareness of annotations |
| 84 | 93 | // or form fields, so if you just add a page without thinking | ... | ... |
libqpdf/QPDF_pages.cc
| ... | ... | @@ -14,7 +14,15 @@ |
| 14 | 14 | // to all_pages and has been in the public API long before the |
| 15 | 15 | // introduction of mutation APIs, so we're pretty much stuck with it. |
| 16 | 16 | // Anyway, there are lots of calls to it in the library, so the |
| 17 | -// efficiency of having it cached is probably worth keeping it. | |
| 17 | +// efficiency of having it cached is probably worth keeping it. At one | |
| 18 | +// point, I had partially implemented a helper class specifically for | |
| 19 | +// the pages tree, but once you work in all the logic that handles | |
| 20 | +// repairing the /Type keys of page tree nodes (both /Pages and /Page) | |
| 21 | +// and deal with duplicate pages, it's just as complex and less | |
| 22 | +// efficient than what's here. So, in spite of the fact that a const | |
| 23 | +// reference is returned, the current code is fine and does not need | |
| 24 | +// to be replaced. A partial implementation of QPDFPagesTree is in | |
| 25 | +// github in attic in case there is ever a reason to resurrect it. | |
| 18 | 26 | |
| 19 | 27 | // The goal of this code is to ensure that the all_pages vector, which |
| 20 | 28 | // users may have a reference to, and the pageobj_to_pages_pos map, |
| ... | ... | @@ -178,7 +186,10 @@ QPDF::flattenPagesTree() |
| 178 | 186 | size_t const len = this->m->all_pages.size(); |
| 179 | 187 | for (size_t pos = 0; pos < len; ++pos) |
| 180 | 188 | { |
| 181 | - // populate pageobj_to_pages_pos and fix parent pointer | |
| 189 | + // Populate pageobj_to_pages_pos and fix parent pointer. There | |
| 190 | + // should be no duplicates at this point because | |
| 191 | + // pushInheritedAttributesToPage calls getAllPages which | |
| 192 | + // resolves duplicates. | |
| 182 | 193 | insertPageobjToPage(this->m->all_pages.at(pos), toI(pos), true); |
| 183 | 194 | this->m->all_pages.at(pos).replaceKey("/Parent", pages); |
| 184 | 195 | } |
| ... | ... | @@ -201,7 +212,8 @@ QPDF::insertPageobjToPage(QPDFObjectHandle const& obj, int pos, |
| 201 | 212 | if (! this->m->pageobj_to_pages_pos.insert( |
| 202 | 213 | std::make_pair(og, pos)).second) |
| 203 | 214 | { |
| 204 | - QTC::TC("qpdf", "QPDF duplicate page reference"); | |
| 215 | + // The library never calls insertPageobjToPage in a way | |
| 216 | + // that causes this to happen. | |
| 205 | 217 | setLastObjectDescription("page " + QUtil::int_to_string(pos) + |
| 206 | 218 | " (numbered from zero)", |
| 207 | 219 | og.getObj(), og.getGen()); |
| ... | ... | @@ -246,6 +258,13 @@ QPDF::insertPage(QPDFObjectHandle newpage, int pos) |
| 246 | 258 | (pos == QIntC::to_int(this->m->all_pages.size())) ? 1 : // at end |
| 247 | 259 | 2); // insert in middle |
| 248 | 260 | |
| 261 | + auto og = newpage.getObjGen(); | |
| 262 | + if (this->m->pageobj_to_pages_pos.count(og)) | |
| 263 | + { | |
| 264 | + QTC::TC("qpdf", "QPDF resolve duplicated page in insert"); | |
| 265 | + newpage = makeIndirectObject(QPDFObjectHandle(newpage).shallowCopy()); | |
| 266 | + } | |
| 267 | + | |
| 249 | 268 | QPDFObjectHandle pages = getRoot().getKey("/Pages"); |
| 250 | 269 | QPDFObjectHandle kids = pages.getKey("/Kids"); |
| 251 | 270 | assert ((pos >= 0) && (QIntC::to_size(pos) <= this->m->all_pages.size())); | ... | ... |
qpdf/qpdf.testcov
| ... | ... | @@ -193,7 +193,6 @@ qpdf-c called qpdf_init_write_memory 0 |
| 193 | 193 | exercise processFile(name) 0 |
| 194 | 194 | exercise processFile(FILE*) 0 |
| 195 | 195 | exercise processMemoryFile 0 |
| 196 | -QPDF duplicate page reference 0 | |
| 197 | 196 | QPDF remove page 2 |
| 198 | 197 | QPDF insert page 2 |
| 199 | 198 | QPDF updateAllPagesCache 0 |
| ... | ... | @@ -592,3 +591,4 @@ QPDFAcroFormDocumentHelper /DA parse error 0 |
| 592 | 591 | QPDFAcroFormDocumentHelper AP parse error 0 |
| 593 | 592 | qpdf copy fields not this file 0 |
| 594 | 593 | qpdf copy fields non-first from orig 0 |
| 594 | +QPDF resolve duplicated page in insert 0 | ... | ... |
qpdf/qtest/qpdf.test
| ... | ... | @@ -954,7 +954,7 @@ $td->runtest("check output", |
| 954 | 954 | {$td->FILE => "page_api_1-out3.pdf"}); |
| 955 | 955 | $td->runtest("duplicate page", |
| 956 | 956 | {$td->COMMAND => "test_driver 19 page_api_1.pdf"}, |
| 957 | - {$td->FILE => "page_api_1.out", $td->EXIT_STATUS => 2}, | |
| 957 | + {$td->FILE => "page_api_1.out", $td->EXIT_STATUS => 0}, | |
| 958 | 958 | $td->NORMALIZE_NEWLINES); |
| 959 | 959 | $td->runtest("remove page we don't have", |
| 960 | 960 | {$td->COMMAND => "test_driver 22 page_api_1.pdf"}, | ... | ... |
qpdf/qtest/qpdf/page_api_1.out
qpdf/test_driver.cc
| ... | ... | @@ -970,12 +970,18 @@ void runtest(int n, char const* filename1, char const* arg2) |
| 970 | 970 | else if (n == 19) |
| 971 | 971 | { |
| 972 | 972 | // Remove a page and re-insert it in the same file. |
| 973 | - QPDFPageDocumentHelper dh(pdf); | |
| 974 | - std::vector<QPDFPageObjectHelper> pages = dh.getAllPages(); | |
| 973 | + std::vector<QPDFObjectHandle> const& pages = pdf.getAllPages(); | |
| 975 | 974 | |
| 976 | - // Try to insert a page that's already there. | |
| 977 | - dh.addPage(pages.at(5), false); | |
| 978 | - std::cout << "you can't see this" << std::endl; | |
| 975 | + // Try to insert a page that's already there. A shallow copy | |
| 976 | + // gets inserted instead. | |
| 977 | + auto newpage = pages.at(5); | |
| 978 | + size_t count = pages.size(); | |
| 979 | + pdf.addPage(newpage, false); | |
| 980 | + auto last = pages.back(); | |
| 981 | + assert(pages.size() == count + 1); | |
| 982 | + assert(! (last.getObjGen() == newpage.getObjGen())); | |
| 983 | + assert(last.getKey("/Contents").getObjGen() == | |
| 984 | + newpage.getKey("/Contents").getObjGen()); | |
| 979 | 985 | } |
| 980 | 986 | else if (n == 20) |
| 981 | 987 | { | ... | ... |