Commit b8bdef0ad12883d72ced5eb443e6e34a93bbbb91
1 parent
607c3921
Implement deterministic ID
For non-encrypted files, deterministic ID generation uses file contents instead of timestamp and file name. At a small runtime cost, this enables generation of the same /ID if the same inputs are converted in the same way multiple times.
Showing
20 changed files
with
2232 additions
and
51 deletions
ChangeLog
TODO
| ... | ... | @@ -46,6 +46,14 @@ Small, command-line tool only enhancements to do soon |
| 46 | 46 | (libtool). |
| 47 | 47 | |
| 48 | 48 | |
| 49 | +Next ABI change | |
| 50 | +=============== | |
| 51 | + | |
| 52 | +Remove private methods that are there only for ABI compatibility | |
| 53 | +including extra QPDFWriter writeTrailer, writeXRefTable, | |
| 54 | +writeXRefStream. | |
| 55 | + | |
| 56 | + | |
| 49 | 57 | 5.2.0 |
| 50 | 58 | ===== |
| 51 | 59 | ... | ... |
include/qpdf/QPDFWriter.hh
| ... | ... | @@ -35,6 +35,7 @@ |
| 35 | 35 | class QPDF; |
| 36 | 36 | class QPDFObjectHandle; |
| 37 | 37 | class Pl_Count; |
| 38 | +class Pl_MD5; | |
| 38 | 39 | |
| 39 | 40 | class QPDFWriter |
| 40 | 41 | { |
| ... | ... | @@ -189,8 +190,22 @@ class QPDFWriter |
| 189 | 190 | QPDF_DLL |
| 190 | 191 | void setExtraHeaderText(std::string const&); |
| 191 | 192 | |
| 193 | + // Causes a deterministic /ID value to be generated. When this is | |
| 194 | + // set, the current time and output file name are not used as part | |
| 195 | + // of /ID generation. Instead, a digest of all significant parts | |
| 196 | + // of the output file's contents is included in the /ID | |
| 197 | + // calculation. Use of a deterministic /ID can be handy when it is | |
| 198 | + // desirable for a repeat of the same qpdf operation on the same | |
| 199 | + // inputs being written to the same outputs with the same | |
| 200 | + // parameters to generate exactly the same results. This feature | |
| 201 | + // is incompatible with encrypted files because, for encrypted | |
| 202 | + // files, the /ID is generated before any part of the file is | |
| 203 | + // written since it is an input to the encryption process. | |
| 204 | + QPDF_DLL | |
| 205 | + void setDeterministicID(bool); | |
| 206 | + | |
| 192 | 207 | // Cause a static /ID value to be generated. Use only in test |
| 193 | - // suites. | |
| 208 | + // suites. See also setDeterministicID. | |
| 194 | 209 | QPDF_DLL |
| 195 | 210 | void setStaticID(bool); |
| 196 | 211 | |
| ... | ... | @@ -298,6 +313,9 @@ class QPDFWriter |
| 298 | 313 | void writeObject(QPDFObjectHandle object, int object_stream_index = -1); |
| 299 | 314 | void writeTrailer(trailer_e which, int size, |
| 300 | 315 | bool xref_stream, qpdf_offset_t prev = 0); |
| 316 | + void writeTrailer(trailer_e which, int size, | |
| 317 | + bool xref_stream, qpdf_offset_t prev, | |
| 318 | + int linearization_pass); | |
| 301 | 319 | void unparseObject(QPDFObjectHandle object, int level, |
| 302 | 320 | unsigned int flags); |
| 303 | 321 | void unparseObject(QPDFObjectHandle object, int level, |
| ... | ... | @@ -348,6 +366,15 @@ class QPDFWriter |
| 348 | 366 | int hint_id, |
| 349 | 367 | qpdf_offset_t hint_offset, |
| 350 | 368 | qpdf_offset_t hint_length); |
| 369 | + qpdf_offset_t writeXRefTable( | |
| 370 | + trailer_e which, int first, int last, int size, | |
| 371 | + // for linearization | |
| 372 | + qpdf_offset_t prev, | |
| 373 | + bool suppress_offsets, | |
| 374 | + int hint_id, | |
| 375 | + qpdf_offset_t hint_offset, | |
| 376 | + qpdf_offset_t hint_length, | |
| 377 | + int linearization_pass); | |
| 351 | 378 | qpdf_offset_t writeXRefStream( |
| 352 | 379 | int objid, int max_id, qpdf_offset_t max_offset, |
| 353 | 380 | trailer_e which, int first, int last, int size); |
| ... | ... | @@ -360,6 +387,16 @@ class QPDFWriter |
| 360 | 387 | qpdf_offset_t hint_offset, |
| 361 | 388 | qpdf_offset_t hint_length, |
| 362 | 389 | bool skip_compression); |
| 390 | + qpdf_offset_t writeXRefStream( | |
| 391 | + int objid, int max_id, qpdf_offset_t max_offset, | |
| 392 | + trailer_e which, int first, int last, int size, | |
| 393 | + // for linearization | |
| 394 | + qpdf_offset_t prev, | |
| 395 | + int hint_id, | |
| 396 | + qpdf_offset_t hint_offset, | |
| 397 | + qpdf_offset_t hint_length, | |
| 398 | + bool skip_compression, | |
| 399 | + int linearization_pass); | |
| 363 | 400 | int calculateXrefStreamPadding(int xref_bytes); |
| 364 | 401 | |
| 365 | 402 | // When filtering subsections, push additional pipelines to the |
| ... | ... | @@ -380,6 +417,8 @@ class QPDFWriter |
| 380 | 417 | void adjustAESStreamLength(size_t& length); |
| 381 | 418 | void pushEncryptionFilter(); |
| 382 | 419 | void pushDiscardFilter(); |
| 420 | + void pushMD5Pipeline(); | |
| 421 | + void computeDeterministicIDData(); | |
| 383 | 422 | |
| 384 | 423 | void discardGeneration(std::map<QPDFObjGen, int> const& in, |
| 385 | 424 | std::map<int, int>& out); |
| ... | ... | @@ -437,6 +476,9 @@ class QPDFWriter |
| 437 | 476 | std::map<QPDFObjGen, int> object_to_object_stream; |
| 438 | 477 | std::map<int, std::set<QPDFObjGen> > object_stream_to_objects; |
| 439 | 478 | std::list<Pipeline*> pipeline_stack; |
| 479 | + bool deterministic_id; | |
| 480 | + Pl_MD5* md5_pipeline; | |
| 481 | + std::string deterministic_id_data; | |
| 440 | 482 | |
| 441 | 483 | // For linearization only |
| 442 | 484 | std::map<int, int> obj_renumber_no_gen; | ... | ... |
include/qpdf/qpdf-c.h
| ... | ... | @@ -324,8 +324,11 @@ extern "C" { |
| 324 | 324 | QPDF_DLL |
| 325 | 325 | void qpdf_set_qdf_mode(qpdf_data qpdf, QPDF_BOOL value); |
| 326 | 326 | |
| 327 | + QPDF_DLL | |
| 328 | + void qpdf_set_deterministic_ID(qpdf_data qpdf, QPDF_BOOL value); | |
| 329 | + | |
| 327 | 330 | /* Never use qpdf_set_static_ID except in test suites to suppress |
| 328 | - * generation of a random /ID. | |
| 331 | + * generation of a random /ID. See also qpdf_set_deterministic_ID. | |
| 329 | 332 | */ |
| 330 | 333 | QPDF_DLL |
| 331 | 334 | void qpdf_set_static_ID(qpdf_data qpdf, QPDF_BOOL value); | ... | ... |
libqpdf/Pl_MD5.cc
| ... | ... | @@ -3,7 +3,9 @@ |
| 3 | 3 | |
| 4 | 4 | Pl_MD5::Pl_MD5(char const* identifier, Pipeline* next) : |
| 5 | 5 | Pipeline(identifier, next), |
| 6 | - in_progress(false) | |
| 6 | + in_progress(false), | |
| 7 | + enabled(true), | |
| 8 | + persist_across_finish(false) | |
| 7 | 9 | { |
| 8 | 10 | } |
| 9 | 11 | |
| ... | ... | @@ -14,24 +16,27 @@ Pl_MD5::~Pl_MD5() |
| 14 | 16 | void |
| 15 | 17 | Pl_MD5::write(unsigned char* buf, size_t len) |
| 16 | 18 | { |
| 17 | - if (! this->in_progress) | |
| 19 | + if (this->enabled) | |
| 18 | 20 | { |
| 19 | - this->md5.reset(); | |
| 20 | - this->in_progress = true; | |
| 21 | - } | |
| 21 | + if (! this->in_progress) | |
| 22 | + { | |
| 23 | + this->md5.reset(); | |
| 24 | + this->in_progress = true; | |
| 25 | + } | |
| 22 | 26 | |
| 23 | - // Write in chunks in case len is too big to fit in an int. | |
| 24 | - // Assume int is at least 32 bits. | |
| 25 | - static size_t const max_bytes = 1 << 30; | |
| 26 | - size_t bytes_left = len; | |
| 27 | - unsigned char* data = buf; | |
| 28 | - while (bytes_left > 0) | |
| 29 | - { | |
| 30 | - size_t bytes = (bytes_left >= max_bytes ? max_bytes : bytes_left); | |
| 31 | - this->md5.encodeDataIncrementally( | |
| 32 | - reinterpret_cast<char*>(data), bytes); | |
| 33 | - bytes_left -= bytes; | |
| 34 | - data += bytes; | |
| 27 | + // Write in chunks in case len is too big to fit in an int. | |
| 28 | + // Assume int is at least 32 bits. | |
| 29 | + static size_t const max_bytes = 1 << 30; | |
| 30 | + size_t bytes_left = len; | |
| 31 | + unsigned char* data = buf; | |
| 32 | + while (bytes_left > 0) | |
| 33 | + { | |
| 34 | + size_t bytes = (bytes_left >= max_bytes ? max_bytes : bytes_left); | |
| 35 | + this->md5.encodeDataIncrementally( | |
| 36 | + reinterpret_cast<char*>(data), bytes); | |
| 37 | + bytes_left -= bytes; | |
| 38 | + data += bytes; | |
| 39 | + } | |
| 35 | 40 | } |
| 36 | 41 | |
| 37 | 42 | this->getNext()->write(buf, len); |
| ... | ... | @@ -41,16 +46,32 @@ void |
| 41 | 46 | Pl_MD5::finish() |
| 42 | 47 | { |
| 43 | 48 | this->getNext()->finish(); |
| 44 | - this->in_progress = false; | |
| 49 | + if (! this->persist_across_finish) | |
| 50 | + { | |
| 51 | + this->in_progress = false; | |
| 52 | + } | |
| 53 | +} | |
| 54 | + | |
| 55 | +void | |
| 56 | +Pl_MD5::enable(bool enabled) | |
| 57 | +{ | |
| 58 | + this->enabled = enabled; | |
| 59 | +} | |
| 60 | + | |
| 61 | +void | |
| 62 | +Pl_MD5::persistAcrossFinish(bool persist) | |
| 63 | +{ | |
| 64 | + this->persist_across_finish = persist; | |
| 45 | 65 | } |
| 46 | 66 | |
| 47 | 67 | std::string |
| 48 | 68 | Pl_MD5::getHexDigest() |
| 49 | 69 | { |
| 50 | - if (this->in_progress) | |
| 70 | + if (! this->enabled) | |
| 51 | 71 | { |
| 52 | 72 | throw std::logic_error( |
| 53 | - "digest requested for in-progress MD5 Pipeline"); | |
| 73 | + "digest requested for a disabled MD5 Pipeline"); | |
| 54 | 74 | } |
| 75 | + this->in_progress = false; | |
| 55 | 76 | return this->md5.unparse(); |
| 56 | 77 | } | ... | ... |
libqpdf/QPDFWriter.cc
| ... | ... | @@ -9,6 +9,7 @@ |
| 9 | 9 | #include <qpdf/Pl_AES_PDF.hh> |
| 10 | 10 | #include <qpdf/Pl_Flate.hh> |
| 11 | 11 | #include <qpdf/Pl_PNGFilter.hh> |
| 12 | +#include <qpdf/Pl_MD5.hh> | |
| 12 | 13 | #include <qpdf/QUtil.hh> |
| 13 | 14 | #include <qpdf/MD5.hh> |
| 14 | 15 | #include <qpdf/RC4.hh> |
| ... | ... | @@ -77,6 +78,8 @@ QPDFWriter::init() |
| 77 | 78 | cur_stream_length = 0; |
| 78 | 79 | added_newline = false; |
| 79 | 80 | max_ostream_index = 0; |
| 81 | + deterministic_id = false; | |
| 82 | + md5_pipeline = 0; | |
| 80 | 83 | } |
| 81 | 84 | |
| 82 | 85 | QPDFWriter::~QPDFWriter() |
| ... | ... | @@ -264,6 +267,12 @@ QPDFWriter::setStaticID(bool val) |
| 264 | 267 | } |
| 265 | 268 | |
| 266 | 269 | void |
| 270 | +QPDFWriter::setDeterministicID(bool val) | |
| 271 | +{ | |
| 272 | + this->deterministic_id = val; | |
| 273 | +} | |
| 274 | + | |
| 275 | +void | |
| 267 | 276 | QPDFWriter::setStaticAesIV(bool val) |
| 268 | 277 | { |
| 269 | 278 | if (val) |
| ... | ... | @@ -507,10 +516,10 @@ void |
| 507 | 516 | QPDFWriter::copyEncryptionParameters(QPDF& qpdf) |
| 508 | 517 | { |
| 509 | 518 | this->preserve_encryption = false; |
| 510 | - generateID(); | |
| 511 | 519 | QPDFObjectHandle trailer = qpdf.getTrailer(); |
| 512 | 520 | if (trailer.hasKey("/Encrypt")) |
| 513 | 521 | { |
| 522 | + generateID(); | |
| 514 | 523 | this->id1 = |
| 515 | 524 | trailer.getKey("/ID").getArrayItem(0).getStringValue(); |
| 516 | 525 | QPDFObjectHandle encrypt = trailer.getKey("/Encrypt"); |
| ... | ... | @@ -864,6 +873,10 @@ QPDFWriter::popPipelineStack(PointerHolder<Buffer>* bp) |
| 864 | 873 | while (dynamic_cast<Pl_Count*>(this->pipeline_stack.back()) == 0) |
| 865 | 874 | { |
| 866 | 875 | Pipeline* p = this->pipeline_stack.back(); |
| 876 | + if (dynamic_cast<Pl_MD5*>(p) == this->md5_pipeline) | |
| 877 | + { | |
| 878 | + this->md5_pipeline = 0; | |
| 879 | + } | |
| 867 | 880 | this->pipeline_stack.pop_back(); |
| 868 | 881 | Pl_Buffer* buf = dynamic_cast<Pl_Buffer*>(p); |
| 869 | 882 | if (bp && buf) |
| ... | ... | @@ -921,6 +934,36 @@ QPDFWriter::pushDiscardFilter() |
| 921 | 934 | activatePipelineStack(); |
| 922 | 935 | } |
| 923 | 936 | |
| 937 | +void | |
| 938 | +QPDFWriter::pushMD5Pipeline() | |
| 939 | +{ | |
| 940 | + if (! this->id2.empty()) | |
| 941 | + { | |
| 942 | + // Can't happen in the code | |
| 943 | + throw std::logic_error( | |
| 944 | + "Deterministic ID computation enabled after ID" | |
| 945 | + " generation has already occurred."); | |
| 946 | + } | |
| 947 | + assert(this->deterministic_id); | |
| 948 | + assert(this->md5_pipeline == 0); | |
| 949 | + assert(this->pipeline->getCount() == 0); | |
| 950 | + this->md5_pipeline = new Pl_MD5("qpdf md5", this->pipeline); | |
| 951 | + this->md5_pipeline->persistAcrossFinish(true); | |
| 952 | + // Special case code in popPipelineStack clears this->md5_pipeline | |
| 953 | + // upon deletion. | |
| 954 | + pushPipeline(this->md5_pipeline); | |
| 955 | + activatePipelineStack(); | |
| 956 | +} | |
| 957 | + | |
| 958 | +void | |
| 959 | +QPDFWriter::computeDeterministicIDData() | |
| 960 | +{ | |
| 961 | + assert(this->md5_pipeline != 0); | |
| 962 | + assert(this->deterministic_id_data.empty()); | |
| 963 | + this->deterministic_id_data = this->md5_pipeline->getHexDigest(); | |
| 964 | + this->md5_pipeline->enable(false); | |
| 965 | +} | |
| 966 | + | |
| 924 | 967 | int |
| 925 | 968 | QPDFWriter::openObject(int objid) |
| 926 | 969 | { |
| ... | ... | @@ -1069,6 +1112,13 @@ void |
| 1069 | 1112 | QPDFWriter::writeTrailer(trailer_e which, int size, bool xref_stream, |
| 1070 | 1113 | qpdf_offset_t prev) |
| 1071 | 1114 | { |
| 1115 | + writeTrailer(which, size, xref_stream, prev, 0); | |
| 1116 | +} | |
| 1117 | + | |
| 1118 | +void | |
| 1119 | +QPDFWriter::writeTrailer(trailer_e which, int size, bool xref_stream, | |
| 1120 | + qpdf_offset_t prev, int linearization_pass) | |
| 1121 | +{ | |
| 1072 | 1122 | QPDFObjectHandle trailer = getTrimmedTrailer(); |
| 1073 | 1123 | if (! xref_stream) |
| 1074 | 1124 | { |
| ... | ... | @@ -1119,8 +1169,21 @@ QPDFWriter::writeTrailer(trailer_e which, int size, bool xref_stream, |
| 1119 | 1169 | // Write ID |
| 1120 | 1170 | writeStringQDF(" "); |
| 1121 | 1171 | writeString(" /ID ["); |
| 1122 | - writeString(QPDF_String(this->id1).unparse(true)); | |
| 1123 | - writeString(QPDF_String(this->id2).unparse(true)); | |
| 1172 | + if (linearization_pass == 1) | |
| 1173 | + { | |
| 1174 | + writeString("<00000000000000000000000000000000>" | |
| 1175 | + "<00000000000000000000000000000000>"); | |
| 1176 | + } | |
| 1177 | + else | |
| 1178 | + { | |
| 1179 | + if ((linearization_pass == 0) && (this->deterministic_id)) | |
| 1180 | + { | |
| 1181 | + computeDeterministicIDData(); | |
| 1182 | + } | |
| 1183 | + generateID(); | |
| 1184 | + writeString(QPDF_String(this->id1).unparse(true)); | |
| 1185 | + writeString(QPDF_String(this->id2).unparse(true)); | |
| 1186 | + } | |
| 1124 | 1187 | writeString("]"); |
| 1125 | 1188 | |
| 1126 | 1189 | if (which != t_lin_second) |
| ... | ... | @@ -1794,12 +1857,8 @@ QPDFWriter::writeObject(QPDFObjectHandle object, int object_stream_index) |
| 1794 | 1857 | void |
| 1795 | 1858 | QPDFWriter::generateID() |
| 1796 | 1859 | { |
| 1797 | - // Note: we can't call generateID() at the time of construction | |
| 1798 | - // since the caller hasn't yet had a chance to call setStaticID(), | |
| 1799 | - // but we need to generate it before computing encryption | |
| 1800 | - // dictionary parameters. This is why we call this function both | |
| 1801 | - // from setEncryptionParameters() and from write() and return | |
| 1802 | - // immediately if the ID has already been generated. | |
| 1860 | + // Generate the ID lazily so that we can handle the user's | |
| 1861 | + // preference to use static or deterministic ID generation. | |
| 1803 | 1862 | |
| 1804 | 1863 | if (! this->id2.empty()) |
| 1805 | 1864 | { |
| ... | ... | @@ -1822,17 +1881,40 @@ QPDFWriter::generateID() |
| 1822 | 1881 | } |
| 1823 | 1882 | else |
| 1824 | 1883 | { |
| 1825 | - // The PDF specification has guidelines for creating IDs, but it | |
| 1826 | - // states clearly that the only thing that's really important is | |
| 1827 | - // that it is very likely to be unique. We can't really follow | |
| 1828 | - // the guidelines in the spec exactly because we haven't written | |
| 1829 | - // the file yet. This scheme should be fine though. | |
| 1884 | + // The PDF specification has guidelines for creating IDs, but | |
| 1885 | + // it states clearly that the only thing that's really | |
| 1886 | + // important is that it is very likely to be unique. We can't | |
| 1887 | + // really follow the guidelines in the spec exactly because we | |
| 1888 | + // haven't written the file yet. This scheme should be fine | |
| 1889 | + // though. The deterministic ID case uses a digest of a | |
| 1890 | + // sufficient portion of the file's contents such no two | |
| 1891 | + // non-matching files would match in the subsets used for this | |
| 1892 | + // computation. Note that we explicitly omit the filename from | |
| 1893 | + // the digest calculation for deterministic ID so that the same | |
| 1894 | + // file converted with qpdf, in that case, would have the same | |
| 1895 | + // ID regardless of the output file's name. | |
| 1830 | 1896 | |
| 1831 | 1897 | std::string seed; |
| 1832 | - seed += QUtil::int_to_string(QUtil::get_current_time()); | |
| 1898 | + if (this->deterministic_id) | |
| 1899 | + { | |
| 1900 | + if (this->deterministic_id_data.empty()) | |
| 1901 | + { | |
| 1902 | + QTC::TC("qpdf", "QPDFWriter deterministic with no data"); | |
| 1903 | + throw std::logic_error( | |
| 1904 | + "INTERNAL ERROR: QPDFWriter::generateID has no" | |
| 1905 | + " data for deterministic ID. This may happen if" | |
| 1906 | + " deterministic ID and file encryption are requested" | |
| 1907 | + " together."); | |
| 1908 | + } | |
| 1909 | + seed += this->deterministic_id_data; | |
| 1910 | + } | |
| 1911 | + else | |
| 1912 | + { | |
| 1913 | + seed += QUtil::int_to_string(QUtil::get_current_time()); | |
| 1914 | + seed += this->filename; | |
| 1915 | + seed += " "; | |
| 1916 | + } | |
| 1833 | 1917 | seed += " QPDF "; |
| 1834 | - seed += this->filename; | |
| 1835 | - seed += " "; | |
| 1836 | 1918 | if (trailer.hasKey("/Info")) |
| 1837 | 1919 | { |
| 1838 | 1920 | QPDFObjectHandle info = trailer.getKey("/Info"); |
| ... | ... | @@ -2260,8 +2342,6 @@ QPDFWriter::write() |
| 2260 | 2342 | setMinimumPDFVersion("1.5"); |
| 2261 | 2343 | } |
| 2262 | 2344 | |
| 2263 | - generateID(); | |
| 2264 | - | |
| 2265 | 2345 | prepareFileForWrite(); |
| 2266 | 2346 | |
| 2267 | 2347 | if (this->linearized) |
| ... | ... | @@ -2397,6 +2477,17 @@ QPDFWriter::writeXRefTable(trailer_e which, int first, int last, int size, |
| 2397 | 2477 | int hint_id, qpdf_offset_t hint_offset, |
| 2398 | 2478 | qpdf_offset_t hint_length) |
| 2399 | 2479 | { |
| 2480 | + // ABI compatibility | |
| 2481 | + return writeXRefTable(which, first, last, size, prev, suppress_offsets, | |
| 2482 | + hint_id, hint_offset, hint_length, 0); | |
| 2483 | +} | |
| 2484 | + | |
| 2485 | +qpdf_offset_t | |
| 2486 | +QPDFWriter::writeXRefTable(trailer_e which, int first, int last, int size, | |
| 2487 | + qpdf_offset_t prev, bool suppress_offsets, | |
| 2488 | + int hint_id, qpdf_offset_t hint_offset, | |
| 2489 | + qpdf_offset_t hint_length, int linearization_pass) | |
| 2490 | +{ | |
| 2400 | 2491 | writeString("xref\n"); |
| 2401 | 2492 | writeString(QUtil::int_to_string(first)); |
| 2402 | 2493 | writeString(" "); |
| ... | ... | @@ -2426,7 +2517,7 @@ QPDFWriter::writeXRefTable(trailer_e which, int first, int last, int size, |
| 2426 | 2517 | writeString(" 00000 n \n"); |
| 2427 | 2518 | } |
| 2428 | 2519 | } |
| 2429 | - writeTrailer(which, size, false, prev); | |
| 2520 | + writeTrailer(which, size, false, prev, linearization_pass); | |
| 2430 | 2521 | writeString("\n"); |
| 2431 | 2522 | return space_before_zero; |
| 2432 | 2523 | } |
| ... | ... | @@ -2435,8 +2526,9 @@ qpdf_offset_t |
| 2435 | 2526 | QPDFWriter::writeXRefStream(int objid, int max_id, qpdf_offset_t max_offset, |
| 2436 | 2527 | trailer_e which, int first, int last, int size) |
| 2437 | 2528 | { |
| 2529 | + // ABI compatibility | |
| 2438 | 2530 | return writeXRefStream(objid, max_id, max_offset, |
| 2439 | - which, first, last, size, 0, 0, 0, 0, false); | |
| 2531 | + which, first, last, size, 0, 0, 0, 0, false, 0); | |
| 2440 | 2532 | } |
| 2441 | 2533 | |
| 2442 | 2534 | qpdf_offset_t |
| ... | ... | @@ -2445,7 +2537,8 @@ QPDFWriter::writeXRefStream(int xref_id, int max_id, qpdf_offset_t max_offset, |
| 2445 | 2537 | qpdf_offset_t prev, int hint_id, |
| 2446 | 2538 | qpdf_offset_t hint_offset, |
| 2447 | 2539 | qpdf_offset_t hint_length, |
| 2448 | - bool skip_compression) | |
| 2540 | + bool skip_compression, | |
| 2541 | + int linearization_pass) | |
| 2449 | 2542 | { |
| 2450 | 2543 | qpdf_offset_t xref_offset = this->pipeline->getCount(); |
| 2451 | 2544 | qpdf_offset_t space_before_zero = xref_offset - 1; |
| ... | ... | @@ -2545,7 +2638,7 @@ QPDFWriter::writeXRefStream(int xref_id, int max_id, qpdf_offset_t max_offset, |
| 2545 | 2638 | QUtil::int_to_string(first) + " " + |
| 2546 | 2639 | QUtil::int_to_string(last - first + 1) + " ]"); |
| 2547 | 2640 | } |
| 2548 | - writeTrailer(which, size, true, prev); | |
| 2641 | + writeTrailer(which, size, true, prev, linearization_pass); | |
| 2549 | 2642 | writeString("\nstream\n"); |
| 2550 | 2643 | writeBuffer(xref_data); |
| 2551 | 2644 | writeString("\nendstream"); |
| ... | ... | @@ -2725,6 +2818,10 @@ QPDFWriter::writeLinearized() |
| 2725 | 2818 | if (pass == 1) |
| 2726 | 2819 | { |
| 2727 | 2820 | pushDiscardFilter(); |
| 2821 | + if (this->deterministic_id) | |
| 2822 | + { | |
| 2823 | + pushMD5Pipeline(); | |
| 2824 | + } | |
| 2728 | 2825 | } |
| 2729 | 2826 | |
| 2730 | 2827 | // Part 1: header |
| ... | ... | @@ -2807,7 +2904,7 @@ QPDFWriter::writeLinearized() |
| 2807 | 2904 | first_trailer_size, |
| 2808 | 2905 | hint_length + second_xref_offset, |
| 2809 | 2906 | hint_id, hint_offset, hint_length, |
| 2810 | - (pass == 1)); | |
| 2907 | + (pass == 1), pass); | |
| 2811 | 2908 | qpdf_offset_t endpos = this->pipeline->getCount(); |
| 2812 | 2909 | if (pass == 1) |
| 2813 | 2910 | { |
| ... | ... | @@ -2834,7 +2931,8 @@ QPDFWriter::writeLinearized() |
| 2834 | 2931 | { |
| 2835 | 2932 | writeXRefTable(t_lin_first, first_half_start, first_half_end, |
| 2836 | 2933 | first_trailer_size, hint_length + second_xref_offset, |
| 2837 | - (pass == 1), hint_id, hint_offset, hint_length); | |
| 2934 | + (pass == 1), hint_id, hint_offset, hint_length, | |
| 2935 | + pass); | |
| 2838 | 2936 | writeString("startxref\n0\n%%EOF\n"); |
| 2839 | 2937 | } |
| 2840 | 2938 | |
| ... | ... | @@ -2886,7 +2984,7 @@ QPDFWriter::writeLinearized() |
| 2886 | 2984 | second_half_end, second_xref_offset, |
| 2887 | 2985 | t_lin_second, 0, second_half_end, |
| 2888 | 2986 | second_trailer_size, |
| 2889 | - 0, 0, 0, 0, (pass == 1)); | |
| 2987 | + 0, 0, 0, 0, (pass == 1), pass); | |
| 2890 | 2988 | qpdf_offset_t endpos = this->pipeline->getCount(); |
| 2891 | 2989 | |
| 2892 | 2990 | if (pass == 1) |
| ... | ... | @@ -2920,7 +3018,7 @@ QPDFWriter::writeLinearized() |
| 2920 | 3018 | { |
| 2921 | 3019 | space_before_zero = |
| 2922 | 3020 | writeXRefTable(t_lin_second, 0, second_half_end, |
| 2923 | - second_trailer_size); | |
| 3021 | + second_trailer_size, 0, false, 0, 0, 0, pass); | |
| 2924 | 3022 | } |
| 2925 | 3023 | writeString("startxref\n"); |
| 2926 | 3024 | writeString(QUtil::int_to_string(first_xref_offset)); |
| ... | ... | @@ -2930,6 +3028,15 @@ QPDFWriter::writeLinearized() |
| 2930 | 3028 | |
| 2931 | 3029 | if (pass == 1) |
| 2932 | 3030 | { |
| 3031 | + if (this->deterministic_id) | |
| 3032 | + { | |
| 3033 | + QTC::TC("qpdf", "QPDFWriter linearized deterministic ID", | |
| 3034 | + need_xref_stream ? 0 : 1); | |
| 3035 | + computeDeterministicIDData(); | |
| 3036 | + popPipelineStack(); | |
| 3037 | + assert(this->md5_pipeline == 0); | |
| 3038 | + } | |
| 3039 | + | |
| 2933 | 3040 | // Close first pass pipeline |
| 2934 | 3041 | file_size = this->pipeline->getCount(); |
| 2935 | 3042 | popPipelineStack(); |
| ... | ... | @@ -2954,6 +3061,11 @@ QPDFWriter::writeLinearized() |
| 2954 | 3061 | void |
| 2955 | 3062 | QPDFWriter::writeStandard() |
| 2956 | 3063 | { |
| 3064 | + if (this->deterministic_id) | |
| 3065 | + { | |
| 3066 | + pushMD5Pipeline(); | |
| 3067 | + } | |
| 3068 | + | |
| 2957 | 3069 | // Start writing |
| 2958 | 3070 | |
| 2959 | 3071 | writeHeader(); |
| ... | ... | @@ -3005,4 +3117,12 @@ QPDFWriter::writeStandard() |
| 3005 | 3117 | writeString("startxref\n"); |
| 3006 | 3118 | writeString(QUtil::int_to_string(xref_offset)); |
| 3007 | 3119 | writeString("\n%%EOF\n"); |
| 3120 | + | |
| 3121 | + if (this->deterministic_id) | |
| 3122 | + { | |
| 3123 | + QTC::TC("qpdf", "QPDFWriter standard deterministic ID", | |
| 3124 | + this->object_stream_to_objects.empty() ? 0 : 1); | |
| 3125 | + popPipelineStack(); | |
| 3126 | + assert(this->md5_pipeline == 0); | |
| 3127 | + } | |
| 3008 | 3128 | } | ... | ... |
libqpdf/qpdf-c.cc
| ... | ... | @@ -512,6 +512,12 @@ void qpdf_set_qdf_mode(qpdf_data qpdf, QPDF_BOOL value) |
| 512 | 512 | qpdf->qpdf_writer->setQDFMode(value); |
| 513 | 513 | } |
| 514 | 514 | |
| 515 | +void qpdf_set_deterministic_ID(qpdf_data qpdf, QPDF_BOOL value) | |
| 516 | +{ | |
| 517 | + QTC::TC("qpdf", "qpdf-c called qpdf_set_deterministic_ID"); | |
| 518 | + qpdf->qpdf_writer->setDeterministicID(value); | |
| 519 | +} | |
| 520 | + | |
| 515 | 521 | void qpdf_set_static_ID(qpdf_data qpdf, QPDF_BOOL value) |
| 516 | 522 | { |
| 517 | 523 | QTC::TC("qpdf", "qpdf-c called qpdf_set_static_ID"); | ... | ... |
libqpdf/qpdf/Pl_MD5.hh
| ... | ... | @@ -25,10 +25,24 @@ class Pl_MD5: public Pipeline |
| 25 | 25 | virtual void finish(); |
| 26 | 26 | QPDF_DLL |
| 27 | 27 | std::string getHexDigest(); |
| 28 | + // Enable/disable. Disabling the pipeline causes it to become a | |
| 29 | + // pass-through. This makes it possible to stick an MD5 pipeline | |
| 30 | + // in a pipeline when it may or may not be required. Disabling it | |
| 31 | + // avoids incurring the runtime overhead of doing needless | |
| 32 | + // digest computation. | |
| 33 | + QPDF_DLL | |
| 34 | + void enable(bool enabled); | |
| 35 | + // If persistAcrossFinish is called, calls to finish do not | |
| 36 | + // finalize the underlying md5 object. In this case, the object is | |
| 37 | + // not finalized until getHexDigest() is called. | |
| 38 | + QPDF_DLL | |
| 39 | + void persistAcrossFinish(bool); | |
| 28 | 40 | |
| 29 | 41 | private: |
| 30 | 42 | bool in_progress; |
| 31 | 43 | MD5 md5; |
| 44 | + bool enabled; | |
| 45 | + bool persist_across_finish; | |
| 32 | 46 | }; |
| 33 | 47 | |
| 34 | 48 | #endif // __PL_MD5_HH__ | ... | ... |
libtests/md5.cc
| ... | ... | @@ -45,6 +45,13 @@ int main(int, char*[]) |
| 45 | 45 | |
| 46 | 46 | Pl_Discard d; |
| 47 | 47 | Pl_MD5 p("MD5", &d); |
| 48 | + // Create a second pipeline, protect against finish, and call | |
| 49 | + // getHexDigest only once at the end of both passes. Make sure the | |
| 50 | + // checksum is that of the input file concatenated to itself. This | |
| 51 | + // relies on persistAcrossFinish(true) so that finish does not | |
| 52 | + // end the digest computation in progress. | |
| 53 | + Pl_MD5 p2("MD5", &d); | |
| 54 | + p2.persistAcrossFinish(true); | |
| 48 | 55 | for (int i = 0; i < 2; ++i) |
| 49 | 56 | { |
| 50 | 57 | FILE* f = QUtil::safe_fopen("md5.in", "rb"); |
| ... | ... | @@ -61,12 +68,23 @@ int main(int, char*[]) |
| 61 | 68 | else |
| 62 | 69 | { |
| 63 | 70 | p.write(buf, len); |
| 71 | + p2.write(buf, len); | |
| 72 | + if (i == 1) | |
| 73 | + { | |
| 74 | + // Partial digest -- resets after each call to write | |
| 75 | + std::cout << p.getHexDigest() << std::endl; | |
| 76 | + } | |
| 64 | 77 | } |
| 65 | 78 | } |
| 66 | 79 | fclose(f); |
| 67 | 80 | p.finish(); |
| 81 | + p2.finish(); | |
| 82 | + // Make sure calling getHexDigest twice with no intervening | |
| 83 | + // writes results in the same result each time. | |
| 84 | + std::cout << p.getHexDigest() << std::endl; | |
| 68 | 85 | std::cout << p.getHexDigest() << std::endl; |
| 69 | 86 | } |
| 87 | + std::cout << p2.getHexDigest() << std::endl; | |
| 70 | 88 | |
| 71 | 89 | return 0; |
| 72 | 90 | } | ... | ... |
libtests/qtest/md5/md5.out
| ... | ... | @@ -14,3 +14,11 @@ d174ab98d277d9f5a5611c2c9f419d9f |
| 14 | 14 | 0 |
| 15 | 15 | 5f4b4321873433daae578f85c72f9e74 |
| 16 | 16 | 5f4b4321873433daae578f85c72f9e74 |
| 17 | +41f977636f79cf1bad1b439caa7d627c | |
| 18 | +c30e03b5536e37306df25489622e13e3 | |
| 19 | +9dabbd135cc47bb603a94989df37c926 | |
| 20 | +ce80591b269b749f65c53b71d0be5212 | |
| 21 | +db5448be0a1e931cbd84654e82063483 | |
| 22 | +db5448be0a1e931cbd84654e82063483 | |
| 23 | +db5448be0a1e931cbd84654e82063483 | |
| 24 | +9833b12b21147bebb2f33d35807049af | ... | ... |
manual/qpdf-manual.xml
| ... | ... | @@ -991,11 +991,30 @@ outfile.pdf</option> |
| 991 | 991 | file should be given. The following options are available: |
| 992 | 992 | <variablelist> |
| 993 | 993 | <varlistentry> |
| 994 | + <term><option>--deterministic-id</option></term> | |
| 995 | + <listitem> | |
| 996 | + <para> | |
| 997 | + Causes generation of a deterministic value for /ID. This | |
| 998 | + prevents use of timestamp and output file name information in | |
| 999 | + the /ID generation. Instead, at some slight additional runtime | |
| 1000 | + cost, the /ID field is generated to include a digest of the | |
| 1001 | + significant parts of the content of the output PDF file. This | |
| 1002 | + means that a given qpdf operation should generate the same /ID | |
| 1003 | + each time it is run, which can be useful when caching results | |
| 1004 | + or for generation of some test data. Use of this flag is not | |
| 1005 | + compatible with creation of encrypted files. | |
| 1006 | + </para> | |
| 1007 | + </listitem> | |
| 1008 | + </varlistentry> | |
| 1009 | + <varlistentry> | |
| 994 | 1010 | <term><option>--static-id</option></term> |
| 995 | 1011 | <listitem> |
| 996 | 1012 | <para> |
| 997 | - Causes generation of a fixed value for /ID. This is intended | |
| 998 | - for testing only. Never use it for production files. | |
| 1013 | + Causes generation of a fixed value for /ID. This is intended | |
| 1014 | + for testing only. Never use it for production files. If you | |
| 1015 | + are trying to get the same /ID each time for a given file and | |
| 1016 | + you are not generating encrypted files, consider using the | |
| 1017 | + <option>--deterministic-id</option> option. | |
| 999 | 1018 | </para> |
| 1000 | 1019 | </listitem> |
| 1001 | 1020 | </varlistentry> | ... | ... |
qpdf/qpdf-ctest.c
| ... | ... | @@ -427,6 +427,18 @@ static void test18(char const* infile, |
| 427 | 427 | report_errors(); |
| 428 | 428 | } |
| 429 | 429 | |
| 430 | +static void test19(char const* infile, | |
| 431 | + char const* password, | |
| 432 | + char const* outfile, | |
| 433 | + char const* outfile2) | |
| 434 | +{ | |
| 435 | + qpdf_read(qpdf, infile, password); | |
| 436 | + qpdf_init_write(qpdf, outfile); | |
| 437 | + qpdf_set_deterministic_ID(qpdf, QPDF_TRUE); | |
| 438 | + qpdf_write(qpdf); | |
| 439 | + report_errors(); | |
| 440 | +} | |
| 441 | + | |
| 430 | 442 | int main(int argc, char* argv[]) |
| 431 | 443 | { |
| 432 | 444 | char* p = 0; |
| ... | ... | @@ -485,6 +497,7 @@ int main(int argc, char* argv[]) |
| 485 | 497 | (n == 16) ? test16 : |
| 486 | 498 | (n == 17) ? test17 : |
| 487 | 499 | (n == 18) ? test18 : |
| 500 | + (n == 19) ? test19 : | |
| 488 | 501 | 0); |
| 489 | 502 | |
| 490 | 503 | if (fn == 0) | ... | ... |
qpdf/qpdf.cc
| ... | ... | @@ -237,6 +237,7 @@ Testing, Inspection, and Debugging Options\n\ |
| 237 | 237 | These options can be useful for digging into PDF files or for use in\n\ |
| 238 | 238 | automated test suites for software that uses the qpdf library.\n\ |
| 239 | 239 | \n\ |
| 240 | +--deterministic-id generate deterministic /ID\n\ | |
| 240 | 241 | --static-id generate static /ID: FOR TESTING ONLY!\n\ |
| 241 | 242 | --static-aes-iv use a static initialization vector for AES-CBC\n\ |
| 242 | 243 | This is option is not secure! FOR TESTING ONLY!\n\ |
| ... | ... | @@ -1031,6 +1032,7 @@ int main(int argc, char* argv[]) |
| 1031 | 1032 | std::string force_version; |
| 1032 | 1033 | |
| 1033 | 1034 | bool show_npages = false; |
| 1035 | + bool deterministic_id = false; | |
| 1034 | 1036 | bool static_id = false; |
| 1035 | 1037 | bool static_aes_iv = false; |
| 1036 | 1038 | bool suppress_original_object_id = false; |
| ... | ... | @@ -1229,6 +1231,10 @@ int main(int argc, char* argv[]) |
| 1229 | 1231 | } |
| 1230 | 1232 | force_version = parameter; |
| 1231 | 1233 | } |
| 1234 | + else if (strcmp(arg, "deterministic-id") == 0) | |
| 1235 | + { | |
| 1236 | + deterministic_id = true; | |
| 1237 | + } | |
| 1232 | 1238 | else if (strcmp(arg, "static-id") == 0) |
| 1233 | 1239 | { |
| 1234 | 1240 | static_id = true; |
| ... | ... | @@ -1710,6 +1716,10 @@ int main(int argc, char* argv[]) |
| 1710 | 1716 | { |
| 1711 | 1717 | w.setPreserveEncryption(false); |
| 1712 | 1718 | } |
| 1719 | + if (deterministic_id) | |
| 1720 | + { | |
| 1721 | + w.setDeterministicID(true); | |
| 1722 | + } | |
| 1713 | 1723 | if (static_id) |
| 1714 | 1724 | { |
| 1715 | 1725 | w.setStaticID(true); | ... | ... |
qpdf/qpdf.testcov
| ... | ... | @@ -269,3 +269,7 @@ qpdf pages range omitted at end 0 |
| 269 | 269 | qpdf pages range omitted in middle 0 |
| 270 | 270 | qpdf npages 0 |
| 271 | 271 | QPDF already reserved object 0 |
| 272 | +QPDFWriter standard deterministic ID 1 | |
| 273 | +QPDFWriter linearized deterministic ID 1 | |
| 274 | +QPDFWriter deterministic with no data 0 | |
| 275 | +qpdf-c called qpdf_set_deterministic_ID 0 | ... | ... |
qpdf/qtest/qpdf.test
| ... | ... | @@ -990,6 +990,43 @@ $td->runtest("write damaged", |
| 990 | 990 | |
| 991 | 991 | show_ntests(); |
| 992 | 992 | # ---------- |
| 993 | +$td->notify("--- Deterministic ID Tests ---"); | |
| 994 | +$n_tests += 11; | |
| 995 | +foreach my $d ('nn', 'ny', 'yn', 'yy') | |
| 996 | +{ | |
| 997 | + my $linearize = ($d =~ m/^y/); | |
| 998 | + my $ostream = ($d =~ m/y$/); | |
| 999 | + $td->runtest("deterministic ID: linearize/ostream=$d", | |
| 1000 | + {$td->COMMAND => | |
| 1001 | + "qpdf -deterministic-id" . | |
| 1002 | + ($linearize ? " -linearize" : "") . | |
| 1003 | + " -object-streams=" . ($ostream ? "generate" : "disable") . | |
| 1004 | + " deterministic-id-in.pdf a.pdf"}, | |
| 1005 | + {$td->STRING => "", | |
| 1006 | + $td->EXIT_STATUS => 0}); | |
| 1007 | + $td->runtest("compare files", | |
| 1008 | + {$td->FILE => "a.pdf"}, | |
| 1009 | + {$td->FILE => "deterministic-id-$d.pdf"}); | |
| 1010 | +} | |
| 1011 | + | |
| 1012 | +$td->runtest("deterministic ID with encryption", | |
| 1013 | + {$td->COMMAND => "qpdf -deterministic-id encrypted-with-images.pdf a.pdf"}, | |
| 1014 | + {$td->STRING => "INTERNAL ERROR: QPDFWriter::generateID" . | |
| 1015 | + " has no data for deterministic ID." . | |
| 1016 | + " This may happen if deterministic ID and" . | |
| 1017 | + " file encryption are requested together.\n", | |
| 1018 | + $td->EXIT_STATUS => 2}, | |
| 1019 | + $td->NORMALIZE_NEWLINES); | |
| 1020 | +$td->runtest("deterministic ID (C API)", | |
| 1021 | + {$td->COMMAND => | |
| 1022 | + "qpdf-ctest 19 deterministic-id-in.pdf '' a.pdf"}, | |
| 1023 | + {$td->STRING => "", | |
| 1024 | + $td->EXIT_STATUS => 0}); | |
| 1025 | +$td->runtest("compare files", | |
| 1026 | + {$td->FILE => "a.pdf"}, | |
| 1027 | + {$td->FILE => "deterministic-id-nn.pdf"}); | |
| 1028 | + | |
| 1029 | +# ---------- | |
| 993 | 1030 | $td->notify("--- Object Stream Tests ---"); |
| 994 | 1031 | $n_tests += (36 * 4) + (12 * 2); |
| 995 | 1032 | $n_compare_pdfs += 36; | ... | ... |
qpdf/qtest/qpdf/deterministic-id-in.pdf
0 → 100644
No preview for this file type
qpdf/qtest/qpdf/deterministic-id-nn.pdf
0 → 100644
No preview for this file type
qpdf/qtest/qpdf/deterministic-id-ny.pdf
0 → 100644
No preview for this file type
qpdf/qtest/qpdf/deterministic-id-yn.pdf
0 → 100644
No preview for this file type
qpdf/qtest/qpdf/deterministic-id-yy.pdf
0 → 100644
No preview for this file type