Commit 52f9d326a56e6409a1487c724241f91de33e3ba6
1 parent
9e01c8bd
Resolve duplicated page objects (fixes #268)
When linearizing a file or getting the list of all pages in a file, detect if the pages tree contains a duplicated page object and, if so, shallow copy it. This makes it possible to have a one to one mapping of page positions to page objects.
Showing
11 changed files
with
98 additions
and
17 deletions
ChangeLog
| 1 | +2019-01-28 Jay Berkenbilt <ejb@ql.org> | |
| 2 | + | |
| 3 | + * When linearizing or getting the list of all pages in a file, | |
| 4 | + replace duplicated page objects with a shallow copy of the page | |
| 5 | + object. Linearization and all page manipulation APIs require page | |
| 6 | + objects to be unique. Pages that were originally duplicated will | |
| 7 | + still share contents and any other indirect resources. Fixes #268. | |
| 8 | + | |
| 1 | 9 | 2019-01-26 Jay Berkenbilt <ejb@ql.org> |
| 2 | 10 | |
| 3 | 11 | * Add --overlay and --underlay options. Fixes #207. | ... | ... |
include/qpdf/QPDF.hh
| ... | ... | @@ -527,15 +527,16 @@ class QPDF |
| 527 | 527 | void optimize(std::map<int, int> const& object_stream_data, |
| 528 | 528 | bool allow_changes = true); |
| 529 | 529 | |
| 530 | - // Traverse page tree return all /Page objects. For efficiency, | |
| 531 | - // this method returns a const reference to an internal vector of | |
| 532 | - // pages. Calls to addPage, addPageAt, and removePage safely | |
| 533 | - // update this, but directly manipulation of the pages three or | |
| 534 | - // pushing inheritable objects to the page level may invalidate | |
| 535 | - // it. See comments for updateAllPagesCache() for additional | |
| 536 | - // notes. Newer code should use | |
| 537 | - // QPDFPageDocumentHelper::getAllPages instead. The decision to | |
| 538 | - // expose this internal cache was arguably incorrect, but it is | |
| 530 | + // Traverse page tree return all /Page objects. It also detects | |
| 531 | + // and resolves cases in which the same /Page object is | |
| 532 | + // duplicated. For efficiency, this method returns a const | |
| 533 | + // reference to an internal vector of pages. Calls to addPage, | |
| 534 | + // addPageAt, and removePage safely update this, but directly | |
| 535 | + // manipulation of the pages three or pushing inheritable objects | |
| 536 | + // to the page level may invalidate it. See comments for | |
| 537 | + // updateAllPagesCache() for additional notes. Newer code should | |
| 538 | + // use QPDFPageDocumentHelper::getAllPages instead. The decision | |
| 539 | + // to expose this internal cache was arguably incorrect, but it is | |
| 539 | 540 | // being left here for compatibility. It is, however, completely |
| 540 | 541 | // safe to use this for files that you are not modifying. |
| 541 | 542 | QPDF_DLL |
| ... | ... | @@ -895,6 +896,10 @@ class QPDF |
| 895 | 896 | void getAllPagesInternal2(QPDFObjectHandle cur_pages, |
| 896 | 897 | std::vector<QPDFObjectHandle>& result, |
| 897 | 898 | std::set<QPDFObjGen>& visited); |
| 899 | + void getAllPagesInternal3(QPDFObjectHandle cur_pages, | |
| 900 | + std::vector<QPDFObjectHandle>& result, | |
| 901 | + std::set<QPDFObjGen>& visited, | |
| 902 | + std::set<QPDFObjGen>& seen); | |
| 898 | 903 | void insertPage(QPDFObjectHandle newpage, int pos); |
| 899 | 904 | int findPage(QPDFObjGen const& og); |
| 900 | 905 | int findPage(QPDFObjectHandle& page); | ... | ... |
libqpdf/QPDF_optimization.cc
| ... | ... | @@ -156,6 +156,9 @@ QPDF::pushInheritedAttributesToPage(bool allow_changes, bool warn_skipped_keys) |
| 156 | 156 | return; |
| 157 | 157 | } |
| 158 | 158 | |
| 159 | + // Calling getAllPages() resolves any duplicated page objects. | |
| 160 | + getAllPages(); | |
| 161 | + | |
| 159 | 162 | // key_ancestors is a mapping of page attribute keys to a stack of |
| 160 | 163 | // Pages nodes that contain values for them. |
| 161 | 164 | std::map<std::string, std::vector<QPDFObjectHandle> > key_ancestors; | ... | ... |
libqpdf/QPDF_pages.cc
| ... | ... | @@ -62,8 +62,18 @@ QPDF::getAllPagesInternal(QPDFObjectHandle cur_pages, |
| 62 | 62 | |
| 63 | 63 | void |
| 64 | 64 | QPDF::getAllPagesInternal2(QPDFObjectHandle cur_pages, |
| 65 | - std::vector<QPDFObjectHandle>& result, | |
| 66 | - std::set<QPDFObjGen>& visited) | |
| 65 | + std::vector<QPDFObjectHandle>& result, | |
| 66 | + std::set<QPDFObjGen>& visited) | |
| 67 | +{ | |
| 68 | + std::set<QPDFObjGen> seen; | |
| 69 | + getAllPagesInternal3(cur_pages, result, visited, seen); | |
| 70 | +} | |
| 71 | + | |
| 72 | +void | |
| 73 | +QPDF::getAllPagesInternal3(QPDFObjectHandle cur_pages, | |
| 74 | + std::vector<QPDFObjectHandle>& result, | |
| 75 | + std::set<QPDFObjGen>& visited, | |
| 76 | + std::set<QPDFObjGen>& seen) | |
| 67 | 77 | { |
| 68 | 78 | QPDFObjGen this_og = cur_pages.getObjGen(); |
| 69 | 79 | if (visited.count(this_og) > 0) |
| ... | ... | @@ -94,11 +104,21 @@ QPDF::getAllPagesInternal2(QPDFObjectHandle cur_pages, |
| 94 | 104 | int n = kids.getArrayNItems(); |
| 95 | 105 | for (int i = 0; i < n; ++i) |
| 96 | 106 | { |
| 97 | - getAllPagesInternal2(kids.getArrayItem(i), result, visited); | |
| 107 | + QPDFObjectHandle kid = kids.getArrayItem(i); | |
| 108 | + if (seen.count(kid.getObjGen())) | |
| 109 | + { | |
| 110 | + // Make a copy of the page. This does the same as | |
| 111 | + // shallowCopyPage in QPDFPageObjectHelper. | |
| 112 | + QTC::TC("qpdf", "QPDF resolve duplicated page object"); | |
| 113 | + kid = makeIndirectObject(QPDFObjectHandle(kid).shallowCopy()); | |
| 114 | + kids.setArrayItem(i, kid); | |
| 115 | + } | |
| 116 | + getAllPagesInternal3(kid, result, visited, seen); | |
| 98 | 117 | } |
| 99 | 118 | } |
| 100 | 119 | else if (type == "/Page") |
| 101 | 120 | { |
| 121 | + seen.insert(this_og); | |
| 102 | 122 | result.push_back(cur_pages); |
| 103 | 123 | } |
| 104 | 124 | else | ... | ... |
manual/qpdf-manual.xml
| ... | ... | @@ -4410,6 +4410,13 @@ print "\n"; |
| 4410 | 4410 | suite and properly handled. |
| 4411 | 4411 | </para> |
| 4412 | 4412 | </listitem> |
| 4413 | + <listitem> | |
| 4414 | + <para> | |
| 4415 | + Linearization and page manipulation APIs now detect and | |
| 4416 | + recover from files that have duplicate Page objects in the | |
| 4417 | + pages tree. | |
| 4418 | + </para> | |
| 4419 | + </listitem> | |
| 4413 | 4420 | </itemizedlist> |
| 4414 | 4421 | </listitem> |
| 4415 | 4422 | <listitem> | ... | ... |
qpdf/qpdf.testcov
qpdf/qtest/qpdf.test
| ... | ... | @@ -582,7 +582,7 @@ $td->runtest("check output", |
| 582 | 582 | {$td->FILE => "page_api_1-out2.pdf"}); |
| 583 | 583 | $td->runtest("duplicate page", |
| 584 | 584 | {$td->COMMAND => "test_driver 17 page_api_2.pdf"}, |
| 585 | - {$td->FILE => "page_api_2.out", $td->EXIT_STATUS => 2}, | |
| 585 | + {$td->FILE => "page_api_2.out", $td->EXIT_STATUS => 0}, | |
| 586 | 586 | $td->NORMALIZE_NEWLINES); |
| 587 | 587 | $td->runtest("delete and re-add a page", |
| 588 | 588 | {$td->COMMAND => "test_driver 18 page_api_1.pdf"}, |
| ... | ... | @@ -1729,6 +1729,30 @@ foreach my $f (qw(screen print)) |
| 1729 | 1729 | |
| 1730 | 1730 | show_ntests(); |
| 1731 | 1731 | # ---------- |
| 1732 | +$td->notify("--- Duplicated Page Object ---"); | |
| 1733 | +$n_tests += 4; | |
| 1734 | + | |
| 1735 | +$td->runtest("linearize duplicated pages", | |
| 1736 | + {$td->COMMAND => | |
| 1737 | + "qpdf --static-id --linearize" . | |
| 1738 | + " page_api_2.pdf a.pdf"}, | |
| 1739 | + {$td->STRING => "", $td->EXIT_STATUS => 0}, | |
| 1740 | + $td->NORMALIZE_NEWLINES); | |
| 1741 | +$td->runtest("compare files", | |
| 1742 | + {$td->FILE => "a.pdf"}, | |
| 1743 | + {$td->FILE => "linearize-duplicate-page.pdf"}); | |
| 1744 | +$td->runtest("extract duplicated pages", | |
| 1745 | + {$td->COMMAND => | |
| 1746 | + "qpdf --static-id page_api_2.pdf" . | |
| 1747 | + " --pages . -- a.pdf"}, | |
| 1748 | + {$td->STRING => "", $td->EXIT_STATUS => 0}, | |
| 1749 | + $td->NORMALIZE_NEWLINES); | |
| 1750 | +$td->runtest("compare files", | |
| 1751 | + {$td->FILE => "a.pdf"}, | |
| 1752 | + {$td->FILE => "extract-duplicate-page.pdf"}); | |
| 1753 | + | |
| 1754 | +show_ntests(); | |
| 1755 | +# ---------- | |
| 1732 | 1756 | $td->notify("--- Merging and Splitting ---"); |
| 1733 | 1757 | $n_tests += 22; |
| 1734 | 1758 | ... | ... |
qpdf/qtest/qpdf/extract-duplicate-page.pdf
0 → 100644
No preview for this file type
qpdf/qtest/qpdf/linearize-duplicate-page.pdf
0 → 100644
No preview for this file type
qpdf/qtest/qpdf/page_api_2.out
qpdf/test_driver.cc
| ... | ... | @@ -923,11 +923,24 @@ void runtest(int n, char const* filename1, char const* arg2) |
| 923 | 923 | } |
| 924 | 924 | else if (n == 17) |
| 925 | 925 | { |
| 926 | - // The input file to this test case is broken to exercise an | |
| 927 | - // error condition. | |
| 926 | + // The input file to this test case has a duplicated page. | |
| 927 | + QPDFObjectHandle page_kids = | |
| 928 | + pdf.getRoot().getKey("/Pages").getKey("/Kids"); | |
| 929 | + assert(page_kids.getArrayItem(0).getObjGen() == | |
| 930 | + page_kids.getArrayItem(1).getObjGen()); | |
| 928 | 931 | std::vector<QPDFObjectHandle> const& pages = pdf.getAllPages(); |
| 932 | + assert(pages.size() == 3); | |
| 933 | + assert(! (pages.at(0).getObjGen() == pages.at(1).getObjGen())); | |
| 934 | + assert(QPDFObjectHandle(pages.at(0)).getKey("/Contents").getObjGen() == | |
| 935 | + QPDFObjectHandle(pages.at(1)).getKey("/Contents").getObjGen()); | |
| 929 | 936 | pdf.removePage(pages.at(0)); |
| 930 | - std::cout << "you can't see this" << std::endl; | |
| 937 | + assert(pages.size() == 2); | |
| 938 | + PointerHolder<Buffer> b = QPDFObjectHandle(pages.at(0)). | |
| 939 | + getKey("/Contents").getStreamData(); | |
| 940 | + std::string contents = std::string( | |
| 941 | + reinterpret_cast<char const*>(b->getBuffer()), | |
| 942 | + b->getSize()); | |
| 943 | + assert(contents.find("page 0") != std::string::npos); | |
| 931 | 944 | } |
| 932 | 945 | else if (n == 18) |
| 933 | 946 | { | ... | ... |