Commit 8971443e4680fc1c0babe56da58cc9070a9dae2e
1 parent
ec48820c
QPDF::addPage*: handle duplicate pages more robustly
Showing
8 changed files
with
64 additions
and
42 deletions
ChangeLog
| 1 | +2021-04-05 Jay Berkenbilt <ejb@ql.org> | ||
| 2 | + | ||
| 3 | + * When adding a page, if the page already exists, make a shallow | ||
| 4 | + copy of the page instead of throwing an exception. This makes the | ||
| 5 | + behavior of adding a page from the library consistent with what | ||
| 6 | + the CLI does and also with what the library does if it starts with | ||
| 7 | + a file that already has a duplicated page. Note that this means | ||
| 8 | + that, in some cases, the page you pass to addPage or addPageAt | ||
| 9 | + (either in QPDF or QPDFPageDocumentHelper) will not be the same | ||
| 10 | + object that actually gets added. (This has actually always been | ||
| 11 | + the case.) That means that, if you are going to do subsequent | ||
| 12 | + modification on the page, you should retrieve it again. | ||
| 13 | + | ||
| 1 | 2021-03-11 Jay Berkenbilt <ejb@ql.org> | 14 | 2021-03-11 Jay Berkenbilt <ejb@ql.org> |
| 2 | 15 | ||
| 3 | * 10.3.1: release | 16 | * 10.3.1: release |
TODO
| 1 | Document-level work | 1 | Document-level work |
| 2 | =================== | 2 | =================== |
| 3 | 3 | ||
| 4 | -* Implement the tree helper described in the ABI section and provide | ||
| 5 | - the getPagesTree method so QPDF::getAllPages can be deprecated in | ||
| 6 | - 10.4 and people have time to transition before qpdf 11. Maybe I | ||
| 7 | - should have a qpdf 11 branch so I can make sure I have the | ||
| 8 | - interfaces the way I want them before releasing 10.4. | ||
| 9 | - | ||
| 10 | * QPDFPageCopier -- object for moving pages around within files or | 4 | * QPDFPageCopier -- object for moving pages around within files or |
| 11 | between files and performing various transformations | 5 | between files and performing various transformations |
| 12 | 6 | ||
| @@ -188,25 +182,6 @@ Comments appear in the code prefixed by "ABI" | @@ -188,25 +182,6 @@ Comments appear in the code prefixed by "ABI" | ||
| 188 | before copying, though maybe we don't because it could cause | 182 | before copying, though maybe we don't because it could cause |
| 189 | multiple copies to be made...usually it's better to handle that | 183 | multiple copies to be made...usually it's better to handle that |
| 190 | explicitly. | 184 | explicitly. |
| 191 | -* (Also API) Create a helper class for tree structures like the pages | ||
| 192 | - tree that have /Kids and /Count. It would be great to be able to | ||
| 193 | - eliminate flattenPagesTree and all the page object to page caching, | ||
| 194 | - and to get rid of getAllPages() that returns a reference to an | ||
| 195 | - internal data structure. The tree helper object can have | ||
| 196 | - bidirectional iterator and const_iterator, implement at and | ||
| 197 | - operator[], and have reasonable insertion and deletion semantics. | ||
| 198 | - Then a new getAllPages, perhaps getPagesTree, can return that and | ||
| 199 | - make it easy to change existing code even if it assumes the result | ||
| 200 | - of getAllPages() is mutating automatically as the pages tree is | ||
| 201 | - manipulated. This will free us up to finally get rid of the | ||
| 202 | - troublesome pages cache and its exposure by getAllPages. See also | ||
| 203 | - note below with "balance the pages tree". Maybe we can get rid of | ||
| 204 | - pageobj_to_pages_pos as well or convert it into something that is | ||
| 205 | - merely advisory and not so heavily trusted. We should either make | ||
| 206 | - the code not care if the same object appears more than once in the | ||
| 207 | - pages tree or detect it and shallow copy, though the latter would | ||
| 208 | - cause a significant performance penalty when manipulating pages if | ||
| 209 | - we get rid of the cache. | ||
| 210 | 185 | ||
| 211 | Page splitting/merging | 186 | Page splitting/merging |
| 212 | ====================== | 187 | ====================== |
include/qpdf/QPDFPageDocumentHelper.hh
| @@ -73,12 +73,21 @@ class QPDFPageDocumentHelper: public QPDFDocumentHelper | @@ -73,12 +73,21 @@ class QPDFPageDocumentHelper: public QPDFDocumentHelper | ||
| 73 | // indirect. If it is an indirect object from another QPDF, this | 73 | // indirect. If it is an indirect object from another QPDF, this |
| 74 | // method will call pushInheritedAttributesToPage on the other | 74 | // method will call pushInheritedAttributesToPage on the other |
| 75 | // file and then copy the page to this QPDF using the same | 75 | // file and then copy the page to this QPDF using the same |
| 76 | - // underlying code as copyForeignObject. Note that you can call | ||
| 77 | - // copyForeignObject directly to copy a page from a different | ||
| 78 | - // file, but the resulting object will not be a page in the new | ||
| 79 | - // file. You could do this, for example, to convert a page into a | ||
| 80 | - // form XObject, though for that, you're better off using | ||
| 81 | - // QPDFPageObjectHelper::getFormXObjectForPage. | 76 | + // underlying code as copyForeignObject. At this stage, if the |
| 77 | + // indirect object is already in the pages tree, a shallow copy is | ||
| 78 | + // made to avoid adding the same page more than once. In version | ||
| 79 | + // 10.3.1 and earlier, adding a page that already existed would | ||
| 80 | + // throw an exception and could cause qpdf to crash on subsequent | ||
| 81 | + // page insertions in some cases. Note that this means that, in | ||
| 82 | + // some cases, the page actually added won't be exactly the same | ||
| 83 | + // object as the one passed in. If you want to do subsequent | ||
| 84 | + // modification on the page, you should retrieve it again. | ||
| 85 | + // | ||
| 86 | + // Note that you can call copyForeignObject directly to copy a | ||
| 87 | + // page from a different file, but the resulting object will not | ||
| 88 | + // be a page in the new file. You could do this, for example, to | ||
| 89 | + // convert a page into a form XObject, though for that, you're | ||
| 90 | + // better off using QPDFPageObjectHelper::getFormXObjectForPage. | ||
| 82 | // | 91 | // |
| 83 | // This method does not have any specific awareness of annotations | 92 | // This method does not have any specific awareness of annotations |
| 84 | // or form fields, so if you just add a page without thinking | 93 | // or form fields, so if you just add a page without thinking |
libqpdf/QPDF_pages.cc
| @@ -14,7 +14,15 @@ | @@ -14,7 +14,15 @@ | ||
| 14 | // to all_pages and has been in the public API long before the | 14 | // to all_pages and has been in the public API long before the |
| 15 | // introduction of mutation APIs, so we're pretty much stuck with it. | 15 | // introduction of mutation APIs, so we're pretty much stuck with it. |
| 16 | // Anyway, there are lots of calls to it in the library, so the | 16 | // Anyway, there are lots of calls to it in the library, so the |
| 17 | -// efficiency of having it cached is probably worth keeping it. | 17 | +// efficiency of having it cached is probably worth keeping it. At one |
| 18 | +// point, I had partially implemented a helper class specifically for | ||
| 19 | +// the pages tree, but once you work in all the logic that handles | ||
| 20 | +// repairing the /Type keys of page tree nodes (both /Pages and /Page) | ||
| 21 | +// and deal with duplicate pages, it's just as complex and less | ||
| 22 | +// efficient than what's here. So, in spite of the fact that a const | ||
| 23 | +// reference is returned, the current code is fine and does not need | ||
| 24 | +// to be replaced. A partial implementation of QPDFPagesTree is in | ||
| 25 | +// github in attic in case there is ever a reason to resurrect it. | ||
| 18 | 26 | ||
| 19 | // The goal of this code is to ensure that the all_pages vector, which | 27 | // The goal of this code is to ensure that the all_pages vector, which |
| 20 | // users may have a reference to, and the pageobj_to_pages_pos map, | 28 | // users may have a reference to, and the pageobj_to_pages_pos map, |
| @@ -178,7 +186,10 @@ QPDF::flattenPagesTree() | @@ -178,7 +186,10 @@ QPDF::flattenPagesTree() | ||
| 178 | size_t const len = this->m->all_pages.size(); | 186 | size_t const len = this->m->all_pages.size(); |
| 179 | for (size_t pos = 0; pos < len; ++pos) | 187 | for (size_t pos = 0; pos < len; ++pos) |
| 180 | { | 188 | { |
| 181 | - // populate pageobj_to_pages_pos and fix parent pointer | 189 | + // Populate pageobj_to_pages_pos and fix parent pointer. There |
| 190 | + // should be no duplicates at this point because | ||
| 191 | + // pushInheritedAttributesToPage calls getAllPages which | ||
| 192 | + // resolves duplicates. | ||
| 182 | insertPageobjToPage(this->m->all_pages.at(pos), toI(pos), true); | 193 | insertPageobjToPage(this->m->all_pages.at(pos), toI(pos), true); |
| 183 | this->m->all_pages.at(pos).replaceKey("/Parent", pages); | 194 | this->m->all_pages.at(pos).replaceKey("/Parent", pages); |
| 184 | } | 195 | } |
| @@ -201,7 +212,8 @@ QPDF::insertPageobjToPage(QPDFObjectHandle const& obj, int pos, | @@ -201,7 +212,8 @@ QPDF::insertPageobjToPage(QPDFObjectHandle const& obj, int pos, | ||
| 201 | if (! this->m->pageobj_to_pages_pos.insert( | 212 | if (! this->m->pageobj_to_pages_pos.insert( |
| 202 | std::make_pair(og, pos)).second) | 213 | std::make_pair(og, pos)).second) |
| 203 | { | 214 | { |
| 204 | - QTC::TC("qpdf", "QPDF duplicate page reference"); | 215 | + // The library never calls insertPageobjToPage in a way |
| 216 | + // that causes this to happen. | ||
| 205 | setLastObjectDescription("page " + QUtil::int_to_string(pos) + | 217 | setLastObjectDescription("page " + QUtil::int_to_string(pos) + |
| 206 | " (numbered from zero)", | 218 | " (numbered from zero)", |
| 207 | og.getObj(), og.getGen()); | 219 | og.getObj(), og.getGen()); |
| @@ -246,6 +258,13 @@ QPDF::insertPage(QPDFObjectHandle newpage, int pos) | @@ -246,6 +258,13 @@ QPDF::insertPage(QPDFObjectHandle newpage, int pos) | ||
| 246 | (pos == QIntC::to_int(this->m->all_pages.size())) ? 1 : // at end | 258 | (pos == QIntC::to_int(this->m->all_pages.size())) ? 1 : // at end |
| 247 | 2); // insert in middle | 259 | 2); // insert in middle |
| 248 | 260 | ||
| 261 | + auto og = newpage.getObjGen(); | ||
| 262 | + if (this->m->pageobj_to_pages_pos.count(og)) | ||
| 263 | + { | ||
| 264 | + QTC::TC("qpdf", "QPDF resolve duplicated page in insert"); | ||
| 265 | + newpage = makeIndirectObject(QPDFObjectHandle(newpage).shallowCopy()); | ||
| 266 | + } | ||
| 267 | + | ||
| 249 | QPDFObjectHandle pages = getRoot().getKey("/Pages"); | 268 | QPDFObjectHandle pages = getRoot().getKey("/Pages"); |
| 250 | QPDFObjectHandle kids = pages.getKey("/Kids"); | 269 | QPDFObjectHandle kids = pages.getKey("/Kids"); |
| 251 | assert ((pos >= 0) && (QIntC::to_size(pos) <= this->m->all_pages.size())); | 270 | assert ((pos >= 0) && (QIntC::to_size(pos) <= this->m->all_pages.size())); |
qpdf/qpdf.testcov
| @@ -193,7 +193,6 @@ qpdf-c called qpdf_init_write_memory 0 | @@ -193,7 +193,6 @@ qpdf-c called qpdf_init_write_memory 0 | ||
| 193 | exercise processFile(name) 0 | 193 | exercise processFile(name) 0 |
| 194 | exercise processFile(FILE*) 0 | 194 | exercise processFile(FILE*) 0 |
| 195 | exercise processMemoryFile 0 | 195 | exercise processMemoryFile 0 |
| 196 | -QPDF duplicate page reference 0 | ||
| 197 | QPDF remove page 2 | 196 | QPDF remove page 2 |
| 198 | QPDF insert page 2 | 197 | QPDF insert page 2 |
| 199 | QPDF updateAllPagesCache 0 | 198 | QPDF updateAllPagesCache 0 |
| @@ -592,3 +591,4 @@ QPDFAcroFormDocumentHelper /DA parse error 0 | @@ -592,3 +591,4 @@ QPDFAcroFormDocumentHelper /DA parse error 0 | ||
| 592 | QPDFAcroFormDocumentHelper AP parse error 0 | 591 | QPDFAcroFormDocumentHelper AP parse error 0 |
| 593 | qpdf copy fields not this file 0 | 592 | qpdf copy fields not this file 0 |
| 594 | qpdf copy fields non-first from orig 0 | 593 | qpdf copy fields non-first from orig 0 |
| 594 | +QPDF resolve duplicated page in insert 0 |
qpdf/qtest/qpdf.test
| @@ -954,7 +954,7 @@ $td->runtest("check output", | @@ -954,7 +954,7 @@ $td->runtest("check output", | ||
| 954 | {$td->FILE => "page_api_1-out3.pdf"}); | 954 | {$td->FILE => "page_api_1-out3.pdf"}); |
| 955 | $td->runtest("duplicate page", | 955 | $td->runtest("duplicate page", |
| 956 | {$td->COMMAND => "test_driver 19 page_api_1.pdf"}, | 956 | {$td->COMMAND => "test_driver 19 page_api_1.pdf"}, |
| 957 | - {$td->FILE => "page_api_1.out", $td->EXIT_STATUS => 2}, | 957 | + {$td->FILE => "page_api_1.out", $td->EXIT_STATUS => 0}, |
| 958 | $td->NORMALIZE_NEWLINES); | 958 | $td->NORMALIZE_NEWLINES); |
| 959 | $td->runtest("remove page we don't have", | 959 | $td->runtest("remove page we don't have", |
| 960 | {$td->COMMAND => "test_driver 22 page_api_1.pdf"}, | 960 | {$td->COMMAND => "test_driver 22 page_api_1.pdf"}, |
qpdf/qtest/qpdf/page_api_1.out
qpdf/test_driver.cc
| @@ -970,12 +970,18 @@ void runtest(int n, char const* filename1, char const* arg2) | @@ -970,12 +970,18 @@ void runtest(int n, char const* filename1, char const* arg2) | ||
| 970 | else if (n == 19) | 970 | else if (n == 19) |
| 971 | { | 971 | { |
| 972 | // Remove a page and re-insert it in the same file. | 972 | // Remove a page and re-insert it in the same file. |
| 973 | - QPDFPageDocumentHelper dh(pdf); | ||
| 974 | - std::vector<QPDFPageObjectHelper> pages = dh.getAllPages(); | 973 | + std::vector<QPDFObjectHandle> const& pages = pdf.getAllPages(); |
| 975 | 974 | ||
| 976 | - // Try to insert a page that's already there. | ||
| 977 | - dh.addPage(pages.at(5), false); | ||
| 978 | - std::cout << "you can't see this" << std::endl; | 975 | + // Try to insert a page that's already there. A shallow copy |
| 976 | + // gets inserted instead. | ||
| 977 | + auto newpage = pages.at(5); | ||
| 978 | + size_t count = pages.size(); | ||
| 979 | + pdf.addPage(newpage, false); | ||
| 980 | + auto last = pages.back(); | ||
| 981 | + assert(pages.size() == count + 1); | ||
| 982 | + assert(! (last.getObjGen() == newpage.getObjGen())); | ||
| 983 | + assert(last.getKey("/Contents").getObjGen() == | ||
| 984 | + newpage.getKey("/Contents").getObjGen()); | ||
| 979 | } | 985 | } |
| 980 | else if (n == 20) | 986 | else if (n == 20) |
| 981 | { | 987 | { |