Commit eb802cfa8c7109504ad10bf4c89c47c876d9a382

Authored by Jay Berkenbilt
1 parent e01ae196

Implement page manipulation APIs

include/qpdf/Constants.h
@@ -22,7 +22,8 @@ enum qpdf_error_code_e @@ -22,7 +22,8 @@ enum qpdf_error_code_e
22 qpdf_e_system, /* I/O error, memory error, etc. */ 22 qpdf_e_system, /* I/O error, memory error, etc. */
23 qpdf_e_unsupported, /* PDF feature not (yet) supported by qpdf */ 23 qpdf_e_unsupported, /* PDF feature not (yet) supported by qpdf */
24 qpdf_e_password, /* incorrect password for encrypted file */ 24 qpdf_e_password, /* incorrect password for encrypted file */
25 - qpdf_e_damaged_pdf /* syntax errors or other damage in PDF */ 25 + qpdf_e_damaged_pdf, /* syntax errors or other damage in PDF */
  26 + qpdf_e_pages, /* erroneous or unsupported pages structure */
26 }; 27 };
27 28
28 /* Write Parameters */ 29 /* Write Parameters */
include/qpdf/QPDF.hh
@@ -340,14 +340,26 @@ class QPDF @@ -340,14 +340,26 @@ class QPDF
340 // Convenience routines for common functions. See also 340 // Convenience routines for common functions. See also
341 // QPDFObjectHandle.hh for additional convenience routines. 341 // QPDFObjectHandle.hh for additional convenience routines.
342 342
343 - // Traverse page tree return all /Page objects. 343 + // Page handling API
  344 +
  345 + // Traverse page tree and return all /Page objects. Note that calls
  346 + // to page manipulation APIs will change the internal vector that
  347 + // this routine returns a reference to. If you don't want that,
  348 + // assign this to a regular vector rather than a const reference.
344 QPDF_DLL 349 QPDF_DLL
345 std::vector<QPDFObjectHandle> const& getAllPages(); 350 std::vector<QPDFObjectHandle> const& getAllPages();
346 351
347 - // QPDF internally caches the /Pages tree. This method will clear  
348 - // the cache when e.g. direct modifications have been made. 352 + // This method synchronizes QPDF's cache of the page structure
  353 + // with the actual /Pages tree. If you restrict changes to the
  354 + // /Pages tree, including addition, removal, or replacement of
  355 + // pages or changes to any /Pages objects, to calls to these page
  356 + // handling APIs, you never need to call this method. If you
  357 + // modify /Pages structures directly, you must call this method
  358 + // afterwards. This method updates the internal list of pages, so
  359 + // after calling this method, any previous references returned by
  360 + // getAllPages() will be valid again.
349 QPDF_DLL 361 QPDF_DLL
350 - void clearPagesCache(); 362 + void updateAllPagesCache();
351 363
352 // Add new page at the beginning or the end of the current pdf 364 // Add new page at the beginning or the end of the current pdf
353 QPDF_DLL 365 QPDF_DLL
@@ -356,11 +368,11 @@ class QPDF @@ -356,11 +368,11 @@ class QPDF
356 // Add new page before or after refpage 368 // Add new page before or after refpage
357 QPDF_DLL 369 QPDF_DLL
358 void addPageAt(QPDFObjectHandle newpage, bool before, 370 void addPageAt(QPDFObjectHandle newpage, bool before,
359 - QPDFObjectHandle const& refpage); 371 + QPDFObjectHandle refpage);
360 372
361 - // Remove pageoh from the pdf. 373 + // Remove page from the pdf.
362 QPDF_DLL 374 QPDF_DLL
363 - void removePage(QPDFObjectHandle const& pageoh); 375 + void removePage(QPDFObjectHandle page);
364 376
365 // Resolver class is restricted to QPDFObjectHandle so that only 377 // Resolver class is restricted to QPDFObjectHandle so that only
366 // it can resolve indirect references. 378 // it can resolve indirect references.
@@ -541,12 +553,12 @@ class QPDF @@ -541,12 +553,12 @@ class QPDF
541 553
542 void getAllPagesInternal(QPDFObjectHandle cur_pages, 554 void getAllPagesInternal(QPDFObjectHandle cur_pages,
543 std::vector<QPDFObjectHandle>& result); 555 std::vector<QPDFObjectHandle>& result);
544 - // creates pageobj_to_pages_pos if necessary  
545 - // returns position, or -1 if not found 556 + void insertPage(QPDFObjectHandle newpage, int pos);
546 int findPage(int objid, int generation); 557 int findPage(int objid, int generation);
547 - int findPage(QPDFObjectHandle const& pageoh); // convenience  
548 - 558 + int findPage(QPDFObjectHandle& page);
549 void flattenPagesTree(); 559 void flattenPagesTree();
  560 + void insertPageobjToPage(QPDFObjectHandle const& obj, int pos,
  561 + bool check_duplicate);
550 562
551 // methods to support encryption -- implemented in QPDF_encryption.cc 563 // methods to support encryption -- implemented in QPDF_encryption.cc
552 encryption_method_e interpretCF(QPDFObjectHandle); 564 encryption_method_e interpretCF(QPDFObjectHandle);
libqpdf/QPDF_pages.cc
@@ -6,6 +6,40 @@ @@ -6,6 +6,40 @@
6 #include <qpdf/QUtil.hh> 6 #include <qpdf/QUtil.hh>
7 #include <qpdf/QPDFExc.hh> 7 #include <qpdf/QPDFExc.hh>
8 8
  9 +// In support of page manipulation APIs, these methods internally
  10 +// maintain state about pages in a pair of data structures: all_pages,
  11 +// which is a vector of page objects, and pageobj_to_pages_pos, which
  12 +// maps a page object to its position in the all_pages array.
  13 +// Unfortunately, the getAllPages() method returns a const reference
  14 +// to all_pages and has been in the public API long before the
  15 +// introduction of mutation APIs, so we're pretty much stuck with it.
  16 +// Anyway, there are lots of calls to it in the library, so the
  17 +// efficiency of having it cached is probably worth keeping it.
  18 +
  19 +// The goal of this code is to ensure that the all_pages vector, which
  20 +// users may have a reference to, and the pageobj_to_pages_pos map,
  21 +// which users will not have access to, remain consistent outside of
  22 +// any call to the library. As long as users only touch the /Pages
  23 +// structure through page-specific API calls, they never have to worry
  24 +// about anything, and this will also stay consistent. If a user
  25 +// touches anything about the /Pages structure outside of these calls
  26 +// (such as by directly looking up and manipulating the underlying
  27 +// objects), they can call updateAllPagesCache() to bring things back in
  28 +// sync.
  29 +
  30 +// If the user doesn't ever use the page manipulation APIs, then qpdf
  31 +// leaves the /Pages structure alone. If the user does use the APIs,
  32 +// then we push all inheritable objects down and flatten the /Pages
  33 +// tree. This makes it easier for us to keep /Pages, all_pages, and
  34 +// pageobj_to_pages_pos internally consistent at all times.
  35 +
  36 +// Responsibility for keeping all_pages, pageobj_to_pages_pos, and the
  37 +// Pages structure consistent should remain in as few places as
  38 +// possible. As of initial writing, only flattenPagesTree,
  39 +// insertPage, and removePage, along with methods they call, are
  40 +// concerned with it. Everything else goes through one of those
  41 +// methods.
  42 +
9 std::vector<QPDFObjectHandle> const& 43 std::vector<QPDFObjectHandle> const&
10 QPDF::getAllPages() 44 QPDF::getAllPages()
11 { 45 {
@@ -44,152 +78,173 @@ QPDF::getAllPagesInternal(QPDFObjectHandle cur_pages, @@ -44,152 +78,173 @@ QPDF::getAllPagesInternal(QPDFObjectHandle cur_pages,
44 } 78 }
45 } 79 }
46 80
47 -// FIXXX here down  
48 -  
49 void 81 void
50 -QPDF::clearPagesCache() 82 +QPDF::updateAllPagesCache()
51 { 83 {
  84 + // Force regeneration of the pages cache. We force immediate
  85 + // recalculation of all_pages since users may have references to
  86 + // it that they got from calls to getAllPages(). We can defer
  87 + // recalculation of pageobj_to_pages_pos until needed.
  88 + QTC::TC("qpdf", "QPDF updateAllPagesCache");
52 this->all_pages.clear(); 89 this->all_pages.clear();
53 this->pageobj_to_pages_pos.clear(); 90 this->pageobj_to_pages_pos.clear();
  91 + getAllPages();
54 } 92 }
55 93
56 void 94 void
57 QPDF::flattenPagesTree() 95 QPDF::flattenPagesTree()
58 { 96 {
59 - clearPagesCache(); 97 + // If not already done, flatten the /Pages structure and
  98 + // initialize pageobj_to_pages_pos.
60 99
61 - // FIXME: more specific method, we don't want to generate the extra stuff.  
62 - // We also need cheap fixup after addPage/removePage. 100 + if (! this->pageobj_to_pages_pos.empty())
  101 + {
  102 + return;
  103 + }
63 104
64 - // no compressed objects to be produced here...  
65 - std::map<int, int> object_stream_data;  
66 - optimize(object_stream_data); // push down inheritance 105 + // Push inherited objects down to the /Page level
  106 + optimizePagesTree(true);
  107 + getAllPages();
67 108
68 - std::vector<QPDFObjectHandle> kids = this->getAllPages();  
69 QPDFObjectHandle pages = this->trailer.getKey("/Root").getKey("/Pages"); 109 QPDFObjectHandle pages = this->trailer.getKey("/Root").getKey("/Pages");
70 110
71 - const int len = kids.size(); 111 + int const len = (int)this->all_pages.size();
72 for (int pos = 0; pos < len; ++pos) 112 for (int pos = 0; pos < len; ++pos)
73 { 113 {
74 - // populate pageobj_to_pages_pos  
75 - ObjGen og(kids[pos].getObjectID(), kids[pos].getGeneration());  
76 - if (! this->pageobj_to_pages_pos.insert(std::make_pair(og, pos)).second)  
77 - {  
78 - // insert failed: duplicate entry found  
79 - *out_stream << "WARNING: duplicate page reference found, "  
80 - << "but currently not fully supported." << std::endl;  
81 - }  
82 -  
83 - // fix parent links  
84 - kids[pos].replaceKey("/Parent", pages); 114 + // populate pageobj_to_pages_pos and fix parent pointer
  115 + insertPageobjToPage(this->all_pages[pos], pos, true);
  116 + this->all_pages[pos].replaceKey("/Parent", pages);
85 } 117 }
86 118
87 - pages.replaceKey("/Kids", QPDFObjectHandle::newArray(kids)); 119 + pages.replaceKey("/Kids", QPDFObjectHandle::newArray(this->all_pages));
88 // /Count has not changed 120 // /Count has not changed
89 assert(pages.getKey("/Count").getIntValue() == len); 121 assert(pages.getKey("/Count").getIntValue() == len);
90 } 122 }
91 123
92 -int  
93 -QPDF::findPage(int objid, int generation)  
94 -{  
95 - if (this->pageobj_to_pages_pos.empty())  
96 - {  
97 - flattenPagesTree();  
98 - }  
99 - std::map<ObjGen, int>::iterator it =  
100 - this->pageobj_to_pages_pos.find(ObjGen(objid, generation));  
101 - if (it != this->pageobj_to_pages_pos.end())  
102 - {  
103 - return (*it).second;  
104 - }  
105 - return -1; // throw?  
106 -}  
107 -  
108 -int  
109 -QPDF::findPage(QPDFObjectHandle const& pageoh) 124 +void
  125 +QPDF::insertPageobjToPage(QPDFObjectHandle const& obj, int pos,
  126 + bool check_duplicate) // records obj -> pos in pageobj_to_pages_pos; with check_duplicate set, an already-mapped page is an error
110 { 127 {
111 - if (!pageoh.isInitialized()) 128 + ObjGen og(obj.getObjectID(), obj.getGeneration());
  129 + bool duplicate = (this->pageobj_to_pages_pos.count(og) != 0); // is this page object already mapped?
  130 + if (! (duplicate && check_duplicate)) { this->pageobj_to_pages_pos[og] = pos; } // assign, not insert(): insert() would keep the stale position of a page that merely shifted
  131 + else // duplicate reference while check_duplicate is set: leave the map untouched and report it
112 { 132 {
113 - return -1;  
114 - // TODO? throw 133 + QTC::TC("qpdf", "QPDF duplicate page reference");
  134 + setLastObjectDescription("page " + QUtil::int_to_string(pos) +
  135 + " (numbered from zero)",
  136 + og.obj, og.gen);
  137 + throw QPDFExc(qpdf_e_pages, this->file->getName(),
  138 + this->last_object_description, 0,
  139 + "duplicate page reference found;"
  140 + " this would cause loss of data");
115 } 141 }
116 - return findPage(pageoh.getObjectID(), pageoh.getGeneration());  
117 } 142 }
118 143
119 void 144 void
120 -QPDF::addPage(QPDFObjectHandle newpage, bool first) 145 +QPDF::insertPage(QPDFObjectHandle newpage, int pos)
121 { 146 {
122 - if (this->pageobj_to_pages_pos.empty())  
123 - {  
124 - flattenPagesTree();  
  147 + // pos is numbered from 0, so pos = 0 inserts at the beginning and
  148 + // pos = npages adds to the end.
126 149
127 - newpage.assertPageObject(); // FIXME: currently private 150 + flattenPagesTree();
  151 + newpage.assertPageObject();
  152 +
  153 + QTC::TC("qpdf", "QPDF insert page",
  154 + (pos == 0) ? 0 : // insert at beginning
  155 + (pos == ((int)this->all_pages.size())) ? 1 : // insert at end
  156 + 2); // insert in middle
128 157
129 QPDFObjectHandle pages = this->trailer.getKey("/Root").getKey("/Pages"); 158 QPDFObjectHandle pages = this->trailer.getKey("/Root").getKey("/Pages");
130 QPDFObjectHandle kids = pages.getKey("/Kids"); 159 QPDFObjectHandle kids = pages.getKey("/Kids");
  160 + assert ((pos >= 0) && (pos <= (int)this->all_pages.size()));
131 161
132 newpage.replaceKey("/Parent", pages); 162 newpage.replaceKey("/Parent", pages);
133 - if (first)  
134 - {  
135 - kids.insertItem(0, newpage);  
136 - }  
137 - else 163 + kids.insertItem(pos, newpage);
  164 + int npages = kids.getArrayNItems();
  165 + pages.replaceKey("/Count", QPDFObjectHandle::newInteger(npages));
  166 + this->all_pages.insert(this->all_pages.begin() + pos, newpage);
  167 + assert((int)this->all_pages.size() == npages);
  168 + for (int i = pos + 1; i < npages; ++i)
138 { 169 {
139 - kids.appendItem(newpage); 170 + insertPageobjToPage(this->all_pages[i], i, false);
140 } 171 }
141 - pages.replaceKey("/Count",  
142 - QPDFObjectHandle::newInteger(kids.getArrayNItems()));  
143 -  
144 - // FIXME: this is overkill, but cache is now stale  
145 - clearPagesCache(); 172 + insertPageobjToPage(newpage, pos, true);
  173 + assert((int)this->pageobj_to_pages_pos.size() == npages);
146 } 174 }
147 175
148 void 176 void
149 -QPDF::addPageAt(QPDFObjectHandle newpage, bool before,  
150 - QPDFObjectHandle const &refpage) 177 +QPDF::removePage(QPDFObjectHandle page)
151 { 178 {
152 - int refpos = findPage(refpage); // also ensures flat /Pages  
153 - if (refpos == -1)  
154 - {  
155 - throw "Could not find refpage";  
156 - }  
157 -  
158 - newpage.assertPageObject(); 179 + int pos = findPage(page); // also ensures flat /Pages
  180 + QTC::TC("qpdf", "QPDF remove page",
  181 + (pos == 0) ? 0 : // remove at beginning
  182 + (pos == ((int)this->all_pages.size() - 1)) ? 1 : // remove at end
  183 + 2); // remove in middle
159 184
160 QPDFObjectHandle pages = this->trailer.getKey("/Root").getKey("/Pages"); 185 QPDFObjectHandle pages = this->trailer.getKey("/Root").getKey("/Pages");
161 QPDFObjectHandle kids = pages.getKey("/Kids"); 186 QPDFObjectHandle kids = pages.getKey("/Kids");
162 187
  188 + kids.eraseItem(pos);
  189 + int npages = kids.getArrayNItems();
  190 + pages.replaceKey("/Count", QPDFObjectHandle::newInteger(npages));
  191 + this->all_pages.erase(this->all_pages.begin() + pos);
  192 + assert((int)this->all_pages.size() == npages);
  193 + this->pageobj_to_pages_pos.erase(
  194 + ObjGen(page.getObjectID(), page.getGeneration()));
  195 + assert((int)this->pageobj_to_pages_pos.size() == npages);
  196 + for (int i = pos; i < npages; ++i)
  197 + {
  198 + insertPageobjToPage(this->all_pages[i], i, false);
  199 + }
  200 +}
  201 +
  202 +void
  203 +QPDF::addPageAt(QPDFObjectHandle newpage, bool before,
  204 + QPDFObjectHandle refpage)
  205 +{
  206 + int refpos = findPage(refpage);
163 if (! before) 207 if (! before)
164 { 208 {
165 ++refpos; 209 ++refpos;
166 } 210 }
167 -  
168 - newpage.replaceKey("/Parent", pages);  
169 - kids.insertItem(refpos, newpage);  
170 - pages.replaceKey("/Count",  
171 - QPDFObjectHandle::newInteger(kids.getArrayNItems()));  
172 -  
173 - // FIXME: this is overkill, but cache is now stale  
174 - clearPagesCache(); 211 + insertPage(newpage, refpos);
175 } 212 }
176 213
  214 +
177 void 215 void
178 -QPDF::removePage(QPDFObjectHandle const& pageoh) 216 +QPDF::addPage(QPDFObjectHandle newpage, bool first)
179 { 217 {
180 - int pos = findPage(pageoh); // also ensures flat /Pages  
181 - if (pos == -1) 218 + getAllPages();
  219 + if (first)
182 { 220 {
183 - throw "Can't remove non-existing page"; 221 + insertPage(newpage, 0);
184 } 222 }
  223 + else
  224 + {
  225 + insertPage(newpage, (int)this->all_pages.size());
  226 + }
  227 +}
185 228
186 - QPDFObjectHandle pages = this->trailer.getKey("/Root").getKey("/Pages");  
187 - QPDFObjectHandle kids = pages.getKey("/Kids");  
188 -  
189 - kids.eraseItem(pos);  
190 - pages.replaceKey("/Count",  
191 - QPDFObjectHandle::newInteger(kids.getArrayNItems())); 229 +int
  230 +QPDF::findPage(QPDFObjectHandle& page)
  231 +{
  232 + page.assertPageObject();
  233 + return findPage(page.getObjectID(), page.getGeneration());
  234 +}
192 235
193 - // FIXME: this is overkill, but cache is now stale  
194 - clearPagesCache(); 236 +int
  237 +QPDF::findPage(int objid, int generation)
  238 +{ // returns the position recorded for (objid, generation) in pageobj_to_pages_pos; throws QPDFExc (qpdf_e_pages) if there is none
  239 + flattenPagesTree(); // populates pageobj_to_pages_pos if it is still empty
  240 + std::map<ObjGen, int>::iterator it =
  241 + this->pageobj_to_pages_pos.find(ObjGen(objid, generation));
  242 + if (it == this->pageobj_to_pages_pos.end())
  243 + { // not found: this must throw, otherwise the end() iterator is dereferenced below
  244 + setLastObjectDescription("page object", objid, generation);
  245 + throw QPDFExc(qpdf_e_pages, this->file->getName(),
  246 + this->last_object_description, 0,
  247 + "page object not referenced in /Pages tree");
  248 + }
  249 + return (*it).second;
195 } 250 }
qpdf/qpdf.testcov
@@ -203,3 +203,7 @@ qpdf-c called qpdf_init_write_memory 0 @@ -203,3 +203,7 @@ qpdf-c called qpdf_init_write_memory 0
203 exercise processFile(name) 0 203 exercise processFile(name) 0
204 exercise processFile(FILE*) 0 204 exercise processFile(FILE*) 0
205 exercise processMemoryFile 0 205 exercise processMemoryFile 0
  206 +QPDF duplicate page reference 0
  207 +QPDF remove page 2
  208 +QPDF insert page 2
  209 +QPDF updateAllPagesCache 0