Commit eb802cfa8c7109504ad10bf4c89c47c876d9a382

Authored by Jay Berkenbilt
1 parent e01ae196

Implement page manipulation APIs

include/qpdf/Constants.h
... ... @@ -22,7 +22,8 @@ enum qpdf_error_code_e
22 22 qpdf_e_system, /* I/O error, memory error, etc. */
23 23 qpdf_e_unsupported, /* PDF feature not (yet) supported by qpdf */
24 24 qpdf_e_password, /* incorrect password for encrypted file */
25   - qpdf_e_damaged_pdf /* syntax errors or other damage in PDF */
  25 + qpdf_e_damaged_pdf, /* syntax errors or other damage in PDF */
  26 + qpdf_e_pages, /* erroneous or unsupported pages structure */
26 27 };
27 28  
28 29 /* Write Parameters */
... ...
include/qpdf/QPDF.hh
... ... @@ -340,14 +340,26 @@ class QPDF
340 340 // Convenience routines for common functions. See also
341 341 // QPDFObjectHandle.hh for additional convenience routines.
342 342  
343   - // Traverse page tree return all /Page objects.
  343 + // Page handling API
  344 +
  345 + // Traverse the page tree and return all /Page objects. Note that calls
  346 + // to page manipulation APIs will change the internal vector that
  347 + // this routine returns a pointer to. If you don't want that,
  348 + // assign this to a regular vector rather than a const reference.
344 349 QPDF_DLL
345 350 std::vector<QPDFObjectHandle> const& getAllPages();
346 351  
347   - // QPDF internally caches the /Pages tree. This method will clear
348   - // the cache when e.g. direct modifications have been made.
  352 + // This method synchronizes QPDF's cache of the page structure
  353 + // with the actual /Pages tree. If you restrict changes to the
  354 + // /Pages tree, including addition, removal, or replacement of
  355 + // pages or changes to any /Pages objects, to calls to these page
  356 + // handling APIs, you never need to call this method. If you
  357 + // modify /Pages structures directly, you must call this method
  358 + // afterwards. This method updates the internal list of pages, so
  359 + // after calling this method, any previous references returned by
  360 + // getAllPages() will be valid again.
349 361 QPDF_DLL
350   - void clearPagesCache();
  362 + void updateAllPagesCache();
351 363  
352 364 // Add new page at the beginning or the end of the current pdf
353 365 QPDF_DLL
... ... @@ -356,11 +368,11 @@ class QPDF
356 368 // Add new page before or after refpage
357 369 QPDF_DLL
358 370 void addPageAt(QPDFObjectHandle newpage, bool before,
359   - QPDFObjectHandle const& refpage);
  371 + QPDFObjectHandle refpage);
360 372  
361   - // Remove pageoh from the pdf.
  373 + // Remove page from the pdf.
362 374 QPDF_DLL
363   - void removePage(QPDFObjectHandle const& pageoh);
  375 + void removePage(QPDFObjectHandle page);
364 376  
365 377 // Resolver class is restricted to QPDFObjectHandle so that only
366 378 // it can resolve indirect references.
... ... @@ -541,12 +553,12 @@ class QPDF
541 553  
542 554 void getAllPagesInternal(QPDFObjectHandle cur_pages,
543 555 std::vector<QPDFObjectHandle>& result);
544   - // creates pageobj_to_pages_pos if necessary
545   - // returns position, or -1 if not found
  556 + void insertPage(QPDFObjectHandle newpage, int pos);
546 557 int findPage(int objid, int generation);
547   - int findPage(QPDFObjectHandle const& pageoh); // convenience
548   -
  558 + int findPage(QPDFObjectHandle& page);
549 559 void flattenPagesTree();
  560 + void insertPageobjToPage(QPDFObjectHandle const& obj, int pos,
  561 + bool check_duplicate);
550 562  
551 563 // methods to support encryption -- implemented in QPDF_encryption.cc
552 564 encryption_method_e interpretCF(QPDFObjectHandle);
... ...
libqpdf/QPDF_pages.cc
... ... @@ -6,6 +6,40 @@
6 6 #include <qpdf/QUtil.hh>
7 7 #include <qpdf/QPDFExc.hh>
8 8  
  9 +// In support of page manipulation APIs, these methods internally
  10 +// maintain state about pages in a pair of data structures: all_pages,
  11 +// which is a vector of page objects, and pageobj_to_pages_pos, which
  12 +// maps a page object to its position in the all_pages array.
  13 +// Unfortunately, the getAllPages() method returns a const reference
  14 +// to all_pages and has been in the public API long before the
  15 +// introduction of mutation APIs, so we're pretty much stuck with it.
  16 +// Anyway, there are lots of calls to it in the library, so the
  17 +// efficiency of having it cached is probably worth keeping it.
  18 +
  19 +// The goal of this code is to ensure that the all_pages vector, which
  20 +// users may have a reference to, and the pageobj_to_pages_pos map,
  21 +// which users will not have access to, remain consistent outside of
  22 +// any call to the library. As long as users only touch the /Pages
  23 +// structure through page-specific API calls, they never have to worry
  24 +// about anything, and this will also stay consistent. If a user
  25 +// touches anything about the /Pages structure outside of these calls
  26 +// (such as by directly looking up and manipulating the underlying
  27 +// objects), they can call updateAllPagesCache() to bring things back in
  28 +// sync.
  29 +
  30 +// If the user doesn't ever use the page manipulation APIs, then qpdf
  31 +// leaves the /Pages structure alone. If the user does use the APIs,
  32 +// then we push all inheritable objects down and flatten the /Pages
  33 +// tree. This makes it easier for us to keep /Pages, all_pages, and
  34 +// pageobj_to_pages_pos internally consistent at all times.
  35 +
  36 +// Responsibility for keeping all_pages, pageobj_to_pages_pos, and the
  37 +// Pages structure consistent should remain in as few places as
  38 +// possible. As of initial writing, only flattenPagesTree,
  39 +// insertPage, and removePage, along with methods they call, are
  40 +// concerned with it. Everything else goes through one of those
  41 +// methods.
  42 +
9 43 std::vector<QPDFObjectHandle> const&
10 44 QPDF::getAllPages()
11 45 {
... ... @@ -44,152 +78,173 @@ QPDF::getAllPagesInternal(QPDFObjectHandle cur_pages,
44 78 }
45 79 }
46 80  
47   -// FIXXX here down
48   -
49 81 void
50   -QPDF::clearPagesCache()
  82 +QPDF::updateAllPagesCache()
51 83 {
  84 + // Force regeneration of the pages cache. We force immediate
  85 + // recalculation of all_pages since users may have references to
  86 + // it that they got from calls to getAllPages(). We can defer
  87 + // recalculation of pageobj_to_pages_pos until needed.
  88 + QTC::TC("qpdf", "QPDF updateAllPagesCache");
52 89 this->all_pages.clear();
53 90 this->pageobj_to_pages_pos.clear();
  91 + getAllPages(); // return value unused; called only to repopulate all_pages right away
54 92 }
55 93  
56 94 void
57 95 QPDF::flattenPagesTree()
58 96 {
59   - clearPagesCache();
  97 + // If not already done (pageobj_to_pages_pos is still empty),
  98 + // flatten the /Pages structure and initialize pageobj_to_pages_pos.
60 99  
61   - // FIXME: more specific method, we don't want to generate the extra stuff.
62   - // We also need cheap fixup after addPage/removePage.
  100 + if (! this->pageobj_to_pages_pos.empty())
  101 + {
  102 + return;
  103 + }
63 104  
64   - // no compressed objects to be produced here...
65   - std::map<int, int> object_stream_data;
66   - optimize(object_stream_data); // push down inheritance
  105 + // Push inherited objects down to the /Page level
  106 + optimizePagesTree(true);
  107 + getAllPages(); // return value unused; presumably fills all_pages before it is indexed below -- confirm
67 108  
68   - std::vector<QPDFObjectHandle> kids = this->getAllPages();
69 109 QPDFObjectHandle pages = this->trailer.getKey("/Root").getKey("/Pages");
70 110  
71   - const int len = kids.size();
  111 + int const len = (int)this->all_pages.size();
72 112 for (int pos = 0; pos < len; ++pos)
73 113 {
74   - // populate pageobj_to_pages_pos
75   - ObjGen og(kids[pos].getObjectID(), kids[pos].getGeneration());
76   - if (! this->pageobj_to_pages_pos.insert(std::make_pair(og, pos)).second)
77   - {
78   - // insert failed: duplicate entry found
79   - *out_stream << "WARNING: duplicate page reference found, "
80   - << "but currently not fully supported." << std::endl;
81   - }
82   -
83   - // fix parent links
84   - kids[pos].replaceKey("/Parent", pages);
  114 + // populate pageobj_to_pages_pos and fix parent pointer
  115 + insertPageobjToPage(this->all_pages[pos], pos, true); // check_duplicate = true: a repeated page object throws
  116 + this->all_pages[pos].replaceKey("/Parent", pages);
85 117 }
86 118  
87   - pages.replaceKey("/Kids", QPDFObjectHandle::newArray(kids));
  119 + pages.replaceKey("/Kids", QPDFObjectHandle::newArray(this->all_pages)); // /Kids becomes one flat array holding every page
88 120 // /Count has not changed
89 121 assert(pages.getKey("/Count").getIntValue() == len);
90 122 }
91 123  
92   -int
93   -QPDF::findPage(int objid, int generation)
94   -{
95   - if (this->pageobj_to_pages_pos.empty())
96   - {
97   - flattenPagesTree();
98   - }
99   - std::map<ObjGen, int>::iterator it =
100   - this->pageobj_to_pages_pos.find(ObjGen(objid, generation));
101   - if (it != this->pageobj_to_pages_pos.end())
102   - {
103   - return (*it).second;
104   - }
105   - return -1; // throw?
106   -}
107   -
108   -int
109   -QPDF::findPage(QPDFObjectHandle const& pageoh)
  124 +void
  125 +QPDF::insertPageobjToPage(QPDFObjectHandle const& obj, int pos,
  126 + bool check_duplicate)
110 127 {
111   - if (!pageoh.isInitialized())
  128 + ObjGen og(obj.getObjectID(), obj.getGeneration());
  129 + bool duplicate =
  130 + (! this->pageobj_to_pages_pos.insert(std::make_pair(og, pos)).second);
  131 + if (duplicate && check_duplicate)
112 132 {
113   - return -1;
114   - // TODO? throw
  133 + QTC::TC("qpdf", "QPDF duplicate page reference");
  134 + setLastObjectDescription("page " + QUtil::int_to_string(pos) +
  135 + " (numbered from zero)",
  136 + og.obj, og.gen);
  137 + throw QPDFExc(qpdf_e_pages, this->file->getName(),
  138 + this->last_object_description, 0,
  139 + "duplicate page reference found;"
  140 + " this would cause loss of data");
115 141 }
116   - return findPage(pageoh.getObjectID(), pageoh.getGeneration());
117 142 }
118 143  
119 144 void
120   -QPDF::addPage(QPDFObjectHandle newpage, bool first)
  145 +QPDF::insertPage(QPDFObjectHandle newpage, int pos)
121 146 {
122   - if (this->pageobj_to_pages_pos.empty())
123   - {
124   - flattenPagesTree();
125   - }
  147 + // pos is numbered from 0, so pos = 0 inserts at the beginning and
  148 + // pos = npages adds to the end.
126 149  
127   - newpage.assertPageObject(); // FIXME: currently private
  150 + flattenPagesTree(); // returns immediately when pageobj_to_pages_pos is already populated
  151 + newpage.assertPageObject();
  152 +
  153 + QTC::TC("qpdf", "QPDF insert page",
  154 + (pos == 0) ? 0 : // insert at beginning
  155 + (pos == ((int)this->all_pages.size())) ? 1 : // insert at end
  156 + 2); // insert in middle
128 157  
129 158 QPDFObjectHandle pages = this->trailer.getKey("/Root").getKey("/Pages");
130 159 QPDFObjectHandle kids = pages.getKey("/Kids");
  160 + assert ((pos >= 0) && (pos <= (int)this->all_pages.size()));
131 161  
132 162 newpage.replaceKey("/Parent", pages);
133   - if (first)
134   - {
135   - kids.insertItem(0, newpage);
136   - }
137   - else
  163 + kids.insertItem(pos, newpage);
  164 + int npages = kids.getArrayNItems();
  165 + pages.replaceKey("/Count", QPDFObjectHandle::newInteger(npages));
  166 + this->all_pages.insert(this->all_pages.begin() + pos, newpage); // keep the cached vector in step with /Kids
  167 + assert((int)this->all_pages.size() == npages);
  168 + for (int i = pos + 1; i < npages; ++i)
138 169 {
139   - kids.appendItem(newpage);
  170 + insertPageobjToPage(this->all_pages[i], i, false); // NOTE(review): meant to re-record pages shifted by the insert; only effective if insertPageobjToPage overwrites existing entries -- verify
140 171 }
141   - pages.replaceKey("/Count",
142   - QPDFObjectHandle::newInteger(kids.getArrayNItems()));
143   -
144   - // FIXME: this is overkill, but cache is now stale
145   - clearPagesCache();
  172 + insertPageobjToPage(newpage, pos, true); // check_duplicate = true: throws if newpage already has an entry
  173 + assert((int)this->pageobj_to_pages_pos.size() == npages);
146 174 }
147 175  
148 176 void
149   -QPDF::addPageAt(QPDFObjectHandle newpage, bool before,
150   - QPDFObjectHandle const &refpage)
  177 +QPDF::removePage(QPDFObjectHandle page)
151 178 {
152   - int refpos = findPage(refpage); // also ensures flat /Pages
153   - if (refpos == -1)
154   - {
155   - throw "Could not find refpage";
156   - }
157   -
158   - newpage.assertPageObject();
  179 + int pos = findPage(page); // also ensures flat /Pages; NOTE(review): pos is only usable if findPage reports a missing page by throwing -- verify
  180 + QTC::TC("qpdf", "QPDF remove page",
  181 + (pos == 0) ? 0 : // remove at beginning
  182 + (pos == ((int)this->all_pages.size() - 1)) ? 1 : // remove at end
  183 + 2); // remove in middle
159 184  
160 185 QPDFObjectHandle pages = this->trailer.getKey("/Root").getKey("/Pages");
161 186 QPDFObjectHandle kids = pages.getKey("/Kids");
162 187  
  188 + kids.eraseItem(pos);
  189 + int npages = kids.getArrayNItems();
  190 + pages.replaceKey("/Count", QPDFObjectHandle::newInteger(npages));
  191 + this->all_pages.erase(this->all_pages.begin() + pos); // keep the cached vector in step with /Kids
  192 + assert((int)this->all_pages.size() == npages);
  193 + this->pageobj_to_pages_pos.erase(
  194 + ObjGen(page.getObjectID(), page.getGeneration()));
  195 + assert((int)this->pageobj_to_pages_pos.size() == npages);
  196 + for (int i = pos; i < npages; ++i)
  197 + {
  198 + insertPageobjToPage(this->all_pages[i], i, false); // NOTE(review): meant to re-record pages shifted by the removal; only effective if insertPageobjToPage overwrites existing entries -- verify
  199 + }
  200 +}
  201 +
  202 +void
  203 +QPDF::addPageAt(QPDFObjectHandle newpage, bool before,
  204 + QPDFObjectHandle refpage)
  205 +{
  206 + int refpos = findPage(refpage); // position of refpage as recorded in pageobj_to_pages_pos
163 207 if (! before)
164 208 {
165 209 ++refpos;
166 210 }
167   -
168   - newpage.replaceKey("/Parent", pages);
169   - kids.insertItem(refpos, newpage);
170   - pages.replaceKey("/Count",
171   - QPDFObjectHandle::newInteger(kids.getArrayNItems()));
172   -
173   - // FIXME: this is overkill, but cache is now stale
174   - clearPagesCache();
  211 + insertPage(newpage, refpos); // insertPage updates /Kids, /Count, all_pages and pageobj_to_pages_pos
175 212 }
176 213  
  214 +
177 215 void
178   -QPDF::removePage(QPDFObjectHandle const& pageoh)
  216 +QPDF::addPage(QPDFObjectHandle newpage, bool first)
179 217 {
180   - int pos = findPage(pageoh); // also ensures flat /Pages
181   - if (pos == -1)
  218 + getAllPages(); // return value unused; presumably ensures all_pages is populated before its size is read below -- confirm
  219 + if (first)
182 220 {
183   - throw "Can't remove non-existing page";
  221 + insertPage(newpage, 0); // pos 0 inserts at the beginning
184 222 }
  223 + else
  224 + {
  225 + insertPage(newpage, (int)this->all_pages.size()); // pos == current page count adds to the end
  226 + }
  227 +}
185 228  
186   - QPDFObjectHandle pages = this->trailer.getKey("/Root").getKey("/Pages");
187   - QPDFObjectHandle kids = pages.getKey("/Kids");
188   -
189   - kids.eraseItem(pos);
190   - pages.replaceKey("/Count",
191   - QPDFObjectHandle::newInteger(kids.getArrayNItems()));
  229 +int
  230 +QPDF::findPage(QPDFObjectHandle& page)
  231 +{
  232 + page.assertPageObject(); // presumably rejects handles that are not page objects -- confirm in QPDFObjectHandle
  233 + return findPage(page.getObjectID(), page.getGeneration()); // delegate to the lookup keyed by object ID and generation
  234 +}
192 235  
193   - // FIXME: this is overkill, but cache is now stale
194   - clearPagesCache();
  236 +int
  237 +QPDF::findPage(int objid, int generation)
  238 +{
  239 + flattenPagesTree();
  240 + std::map<ObjGen, int>::iterator it =
  241 + this->pageobj_to_pages_pos.find(ObjGen(objid, generation));
  242 + if (it == this->pageobj_to_pages_pos.end())
  243 + {
  244 + setLastObjectDescription("page object", objid, generation);
  245 + QPDFExc(qpdf_e_pages, this->file->getName(),
  246 + this->last_object_description, 0,
  247 + "page object not referenced in /Pages tree");
  248 + }
  249 + return (*it).second;
195 250 }
... ...
qpdf/qpdf.testcov
... ... @@ -203,3 +203,7 @@ qpdf-c called qpdf_init_write_memory 0
203 203 exercise processFile(name) 0
204 204 exercise processFile(FILE*) 0
205 205 exercise processMemoryFile 0
  206 +QPDF duplicate page reference 0
  207 +QPDF remove page 2
  208 +QPDF insert page 2
  209 +QPDF updateAllPagesCache 0
... ...