Commit 52f9d326a56e6409a1487c724241f91de33e3ba6

Authored by Jay Berkenbilt
1 parent 9e01c8bd

Resolve duplicated page objects (fixes #268)

When linearizing a file or getting the list of all pages in a file,
detect if the pages tree contains a duplicated page object and, if so,
shallow copy it. This makes it possible to have a one to one mapping
of page positions to page objects.
ChangeLog
  1 +2019-01-28 Jay Berkenbilt <ejb@ql.org>
  2 +
  3 + * When linearizing or getting the list of all pages in a file,
  4 + replace duplicated page objects with a shallow copy of the page
  5 + object. Linearization and all page manipulation APIs require page
  6 + objects to be unique. Pages that were originally duplicated will
  7 + still share contents and any other indirect resources. Fixes #268.
  8 +
1 9 2019-01-26 Jay Berkenbilt <ejb@ql.org>
2 10  
3 11 * Add --overlay and --underlay options. Fixes #207.
... ...
include/qpdf/QPDF.hh
... ... @@ -527,15 +527,16 @@ class QPDF
527 527 void optimize(std::map<int, int> const& object_stream_data,
528 528 bool allow_changes = true);
529 529  
530   - // Traverse page tree return all /Page objects. For efficiency,
531   - // this method returns a const reference to an internal vector of
532   - // pages. Calls to addPage, addPageAt, and removePage safely
533   - // update this, but directly manipulation of the pages three or
534   - // pushing inheritable objects to the page level may invalidate
535   - // it. See comments for updateAllPagesCache() for additional
536   - // notes. Newer code should use
537   - // QPDFPageDocumentHelper::getAllPages instead. The decision to
538   - // expose this internal cache was arguably incorrect, but it is
  530 + // Traverse page tree return all /Page objects. It also detects
  531 + // and resolves cases in which the same /Page object is
  532 + // duplicated. For efficiency, this method returns a const
  533 + // reference to an internal vector of pages. Calls to addPage,
  534 + // addPageAt, and removePage safely update this, but directly
  535 + // manipulation of the pages three or pushing inheritable objects
  536 + // to the page level may invalidate it. See comments for
  537 + // updateAllPagesCache() for additional notes. Newer code should
  538 + // use QPDFPageDocumentHelper::getAllPages instead. The decision
  539 + // to expose this internal cache was arguably incorrect, but it is
539 540 // being left here for compatibility. It is, however, completely
540 541 // safe to use this for files that you are not modifying.
541 542 QPDF_DLL
... ... @@ -895,6 +896,10 @@ class QPDF
895 896 void getAllPagesInternal2(QPDFObjectHandle cur_pages,
896 897 std::vector<QPDFObjectHandle>& result,
897 898 std::set<QPDFObjGen>& visited);
  899 + void getAllPagesInternal3(QPDFObjectHandle cur_pages,
  900 + std::vector<QPDFObjectHandle>& result,
  901 + std::set<QPDFObjGen>& visited,
  902 + std::set<QPDFObjGen>& seen);
898 903 void insertPage(QPDFObjectHandle newpage, int pos);
899 904 int findPage(QPDFObjGen const& og);
900 905 int findPage(QPDFObjectHandle& page);
... ...
libqpdf/QPDF_optimization.cc
... ... @@ -156,6 +156,9 @@ QPDF::pushInheritedAttributesToPage(bool allow_changes, bool warn_skipped_keys)
156 156 return;
157 157 }
158 158  
  159 + // Calling getAllPages() resolves any duplicated page objects.
  160 + getAllPages();
  161 +
159 162 // key_ancestors is a mapping of page attribute keys to a stack of
160 163 // Pages nodes that contain values for them.
161 164 std::map<std::string, std::vector<QPDFObjectHandle> > key_ancestors;
... ...
libqpdf/QPDF_pages.cc
... ... @@ -62,8 +62,18 @@ QPDF::getAllPagesInternal(QPDFObjectHandle cur_pages,
62 62  
63 63 void
64 64 QPDF::getAllPagesInternal2(QPDFObjectHandle cur_pages,
65   - std::vector<QPDFObjectHandle>& result,
66   - std::set<QPDFObjGen>& visited)
  65 + std::vector<QPDFObjectHandle>& result,
  66 + std::set<QPDFObjGen>& visited)
  67 +{
  68 + std::set<QPDFObjGen> seen;
  69 + getAllPagesInternal3(cur_pages, result, visited, seen);
  70 +}
  71 +
  72 +void
  73 +QPDF::getAllPagesInternal3(QPDFObjectHandle cur_pages,
  74 + std::vector<QPDFObjectHandle>& result,
  75 + std::set<QPDFObjGen>& visited,
  76 + std::set<QPDFObjGen>& seen)
67 77 {
68 78 QPDFObjGen this_og = cur_pages.getObjGen();
69 79 if (visited.count(this_og) > 0)
... ... @@ -94,11 +104,21 @@ QPDF::getAllPagesInternal2(QPDFObjectHandle cur_pages,
94 104 int n = kids.getArrayNItems();
95 105 for (int i = 0; i < n; ++i)
96 106 {
97   - getAllPagesInternal2(kids.getArrayItem(i), result, visited);
  107 + QPDFObjectHandle kid = kids.getArrayItem(i);
  108 + if (seen.count(kid.getObjGen()))
  109 + {
  110 + // Make a copy of the page. This does the same as
  111 + // shallowCopyPage in QPDFPageObjectHelper.
  112 + QTC::TC("qpdf", "QPDF resolve duplicated page object");
  113 + kid = makeIndirectObject(QPDFObjectHandle(kid).shallowCopy());
  114 + kids.setArrayItem(i, kid);
  115 + }
  116 + getAllPagesInternal3(kid, result, visited, seen);
98 117 }
99 118 }
100 119 else if (type == "/Page")
101 120 {
  121 + seen.insert(this_og);
102 122 result.push_back(cur_pages);
103 123 }
104 124 else
... ...
manual/qpdf-manual.xml
... ... @@ -4410,6 +4410,13 @@ print &quot;\n&quot;;
4410 4410 suite and properly handled.
4411 4411 </para>
4412 4412 </listitem>
  4413 + <listitem>
  4414 + <para>
  4415 + Linearization and page manipulation APIs now detect and
  4416 + recover from files that have duplicate Page objects in the
  4417 + pages tree.
  4418 + </para>
  4419 + </listitem>
4413 4420 </itemizedlist>
4414 4421 </listitem>
4415 4422 <listitem>
... ...
qpdf/qpdf.testcov
... ... @@ -428,3 +428,4 @@ QPDFFormFieldObjectHelper fallback Tf 0
428 428 QPDFPageObjectHelper non-trivial inheritance 0
429 429 QPDFPageObjectHelper copy shared attribute 0
430 430 qpdf from_nr from repeat_nr 0
  431 +QPDF resolve duplicated page object 0
... ...
qpdf/qtest/qpdf.test
... ... @@ -582,7 +582,7 @@ $td-&gt;runtest(&quot;check output&quot;,
582 582 {$td->FILE => "page_api_1-out2.pdf"});
583 583 $td->runtest("duplicate page",
584 584 {$td->COMMAND => "test_driver 17 page_api_2.pdf"},
585   - {$td->FILE => "page_api_2.out", $td->EXIT_STATUS => 2},
  585 + {$td->FILE => "page_api_2.out", $td->EXIT_STATUS => 0},
586 586 $td->NORMALIZE_NEWLINES);
587 587 $td->runtest("delete and re-add a page",
588 588 {$td->COMMAND => "test_driver 18 page_api_1.pdf"},
... ... @@ -1729,6 +1729,30 @@ foreach my $f (qw(screen print))
1729 1729  
1730 1730 show_ntests();
1731 1731 # ----------
  1732 +$td->notify("--- Duplicated Page Object ---");
  1733 +$n_tests += 4;
  1734 +
  1735 +$td->runtest("linearize duplicated pages",
  1736 + {$td->COMMAND =>
  1737 + "qpdf --static-id --linearize" .
  1738 + " page_api_2.pdf a.pdf"},
  1739 + {$td->STRING => "", $td->EXIT_STATUS => 0},
  1740 + $td->NORMALIZE_NEWLINES);
  1741 +$td->runtest("compare files",
  1742 + {$td->FILE => "a.pdf"},
  1743 + {$td->FILE => "linearize-duplicate-page.pdf"});
  1744 +$td->runtest("extract duplicated pages",
  1745 + {$td->COMMAND =>
  1746 + "qpdf --static-id page_api_2.pdf" .
  1747 + " --pages . -- a.pdf"},
  1748 + {$td->STRING => "", $td->EXIT_STATUS => 0},
  1749 + $td->NORMALIZE_NEWLINES);
  1750 +$td->runtest("compare files",
  1751 + {$td->FILE => "a.pdf"},
  1752 + {$td->FILE => "extract-duplicate-page.pdf"});
  1753 +
  1754 +show_ntests();
  1755 +# ----------
1732 1756 $td->notify("--- Merging and Splitting ---");
1733 1757 $n_tests += 22;
1734 1758  
... ...
qpdf/qtest/qpdf/extract-duplicate-page.pdf 0 → 100644
No preview for this file type
qpdf/qtest/qpdf/linearize-duplicate-page.pdf 0 → 100644
No preview for this file type
qpdf/qtest/qpdf/page_api_2.out
1   -page_api_2.pdf (page 1 (numbered from zero): object 4 0): duplicate page reference found; this would cause loss of data
  1 +test 17 done
... ...
qpdf/test_driver.cc
... ... @@ -923,11 +923,24 @@ void runtest(int n, char const* filename1, char const* arg2)
923 923 }
924 924 else if (n == 17)
925 925 {
926   - // The input file to this test case is broken to exercise an
927   - // error condition.
  926 + // The input file to this test case has a duplicated page.
  927 + QPDFObjectHandle page_kids =
  928 + pdf.getRoot().getKey("/Pages").getKey("/Kids");
  929 + assert(page_kids.getArrayItem(0).getObjGen() ==
  930 + page_kids.getArrayItem(1).getObjGen());
928 931 std::vector<QPDFObjectHandle> const& pages = pdf.getAllPages();
  932 + assert(pages.size() == 3);
  933 + assert(! (pages.at(0).getObjGen() == pages.at(1).getObjGen()));
  934 + assert(QPDFObjectHandle(pages.at(0)).getKey("/Contents").getObjGen() ==
  935 + QPDFObjectHandle(pages.at(1)).getKey("/Contents").getObjGen());
929 936 pdf.removePage(pages.at(0));
930   - std::cout << "you can't see this" << std::endl;
  937 + assert(pages.size() == 2);
  938 + PointerHolder<Buffer> b = QPDFObjectHandle(pages.at(0)).
  939 + getKey("/Contents").getStreamData();
  940 + std::string contents = std::string(
  941 + reinterpret_cast<char const*>(b->getBuffer()),
  942 + b->getSize());
  943 + assert(contents.find("page 0") != std::string::npos);
931 944 }
932 945 else if (n == 18)
933 946 {
... ...