Commit 1d3fa9124503ed4004851b5bd4033bc1f00268ed
1 parent
c6fbeb87
Refactor object reading logic in `QPDF` for clarity and improved maintainability
Simplify `readObjectAtOffset` by splitting responsibilities into separate functions: `read_object_start` and a streamlined `readObjectAtOffset`. Remove redundant parameters and improve code readability, ensuring better modularity for object reading operations.
Showing
3 changed files
with
137 additions
and
135 deletions
include/qpdf/QPDF.hh
| ... | ... | @@ -795,13 +795,14 @@ class QPDF |
| 795 | 795 | std::shared_ptr<InputSource> input, QPDFObjGen og, qpdf_offset_t stream_offset); |
| 796 | 796 | QPDFTokenizer::Token readToken(InputSource&, size_t max_len = 0); |
| 797 | 797 | |
| 798 | - QPDFObjectHandle readObjectAtOffset( | |
| 798 | + QPDFObjGen read_object_start(qpdf_offset_t offset); | |
| 799 | + void readObjectAtOffset( | |
| 799 | 800 | bool attempt_recovery, |
| 800 | 801 | qpdf_offset_t offset, |
| 801 | 802 | std::string const& description, |
| 802 | - QPDFObjGen exp_og, | |
| 803 | - QPDFObjGen& og, | |
| 804 | - bool skip_cache_if_in_xref); | |
| 803 | + QPDFObjGen exp_og); | |
| 804 | + QPDFObjectHandle readObjectAtOffset( | |
| 805 | + qpdf_offset_t offset, std::string const& description, bool skip_cache_if_in_xref); | |
| 805 | 806 | std::shared_ptr<QPDFObject> const& resolve(QPDFObjGen og); |
| 806 | 807 | void resolveObjectsInStream(int obj_stream_number); |
| 807 | 808 | void stopOnError(std::string const& message); | ... | ... |
libqpdf/QPDF_linearization.cc
| ... | ... | @@ -274,10 +274,8 @@ QPDF::readLinearizationData() |
| 274 | 274 | QPDFObjectHandle |
| 275 | 275 | QPDF::readHintStream(Pipeline& pl, qpdf_offset_t offset, size_t length) |
| 276 | 276 | { |
| 277 | - QPDFObjGen og; | |
| 278 | - QPDFObjectHandle H = | |
| 279 | - readObjectAtOffset(false, offset, "linearization hint stream", QPDFObjGen(0, 0), og, false); | |
| 280 | - ObjCache& oc = m->obj_cache[og]; | |
| 277 | + auto H = readObjectAtOffset(offset, "linearization hint stream", false); | |
| 278 | + ObjCache& oc = m->obj_cache[H]; | |
| 281 | 279 | qpdf_offset_t min_end_offset = oc.end_before_space; |
| 282 | 280 | qpdf_offset_t max_end_offset = oc.end_after_space; |
| 283 | 281 | if (!H.isStream()) { | ... | ... |
libqpdf/QPDF_objects.cc
| ... | ... | @@ -763,12 +763,10 @@ qpdf_offset_t |
| 763 | 763 | QPDF::read_xrefStream(qpdf_offset_t xref_offset, bool in_stream_recovery) |
| 764 | 764 | { |
| 765 | 765 | if (!m->ignore_xref_streams) { |
| 766 | - QPDFObjGen x_og; | |
| 767 | 766 | QPDFObjectHandle xref_obj; |
| 768 | 767 | try { |
| 769 | 768 | m->in_read_xref_stream = true; |
| 770 | - xref_obj = | |
| 771 | - readObjectAtOffset(false, xref_offset, "xref stream", QPDFObjGen(0, 0), x_og, true); | |
| 769 | + xref_obj = readObjectAtOffset(xref_offset, "xref stream", true); | |
| 772 | 770 | } catch (QPDFExc&) { |
| 773 | 771 | // ignore -- report error below |
| 774 | 772 | } |
| ... | ... | @@ -1403,26 +1401,45 @@ QPDF::readToken(InputSource& input, size_t max_len) |
| 1403 | 1401 | return m->tokenizer.readToken(input, m->last_object_description, true, max_len); |
| 1404 | 1402 | } |
| 1405 | 1403 | |
| 1406 | -QPDFObjectHandle | |
| 1404 | +QPDFObjGen | |
| 1405 | +QPDF::read_object_start(qpdf_offset_t offset) | |
| 1406 | +{ | |
| 1407 | + m->file->seek(offset, SEEK_SET); | |
| 1408 | + QPDFTokenizer::Token tobjid = readToken(*m->file); | |
| 1409 | + bool objidok = tobjid.isInteger(); | |
| 1410 | + QTC::TC("qpdf", "QPDF check objid", objidok ? 1 : 0); | |
| 1411 | + if (!objidok) { | |
| 1412 | + QTC::TC("qpdf", "QPDF expected n n obj"); | |
| 1413 | + throw damagedPDF(offset, "expected n n obj"); | |
| 1414 | + } | |
| 1415 | + QPDFTokenizer::Token tgen = readToken(*m->file); | |
| 1416 | + bool genok = tgen.isInteger(); | |
| 1417 | + QTC::TC("qpdf", "QPDF check generation", genok ? 1 : 0); | |
| 1418 | + if (!genok) { | |
| 1419 | + throw damagedPDF(offset, "expected n n obj"); | |
| 1420 | + } | |
| 1421 | + QPDFTokenizer::Token tobj = readToken(*m->file); | |
| 1422 | + | |
| 1423 | + bool objok = tobj.isWord("obj"); | |
| 1424 | + QTC::TC("qpdf", "QPDF check obj", objok ? 1 : 0); | |
| 1425 | + | |
| 1426 | + if (!objok) { | |
| 1427 | + throw damagedPDF(offset, "expected n n obj"); | |
| 1428 | + } | |
| 1429 | + int objid = QUtil::string_to_int(tobjid.getValue().c_str()); | |
| 1430 | + int generation = QUtil::string_to_int(tgen.getValue().c_str()); | |
| 1431 | + if (objid == 0) { | |
| 1432 | + QTC::TC("qpdf", "QPDF object id 0"); | |
| 1433 | + throw damagedPDF(offset, "object with ID 0"); | |
| 1434 | + } | |
| 1435 | + return {objid, generation}; | |
| 1436 | +} | |
| 1437 | + | |
| 1438 | +void | |
| 1407 | 1439 | QPDF::readObjectAtOffset( |
| 1408 | - bool try_recovery, | |
| 1409 | - qpdf_offset_t offset, | |
| 1410 | - std::string const& description, | |
| 1411 | - QPDFObjGen exp_og, | |
| 1412 | - QPDFObjGen& og, | |
| 1413 | - bool skip_cache_if_in_xref) | |
| 1440 | + bool try_recovery, qpdf_offset_t offset, std::string const& description, QPDFObjGen exp_og) | |
| 1414 | 1441 | { |
| 1415 | - bool check_og = true; | |
| 1416 | - if (exp_og.getObj() == 0) { | |
| 1417 | - // This method uses an expect object ID of 0 to indicate that we don't know or don't care | |
| 1418 | - // what the actual object ID is at this offset. This is true when we read the xref stream | |
| 1419 | - // and linearization hint streams. In this case, we don't verify the expect object | |
| 1420 | - // ID/generation against what was read from the file. There is also no reason to attempt | |
| 1421 | - // xref recovery if we get a failure in this case since the read attempt was not triggered | |
| 1422 | - // by an xref lookup. | |
| 1423 | - check_og = false; | |
| 1424 | - try_recovery = false; | |
| 1425 | - } | |
| 1442 | + QPDFObjGen og; | |
| 1426 | 1443 | setLastObjectDescription(description, exp_og); |
| 1427 | 1444 | |
| 1428 | 1445 | if (!m->attempt_recovery) { |
| ... | ... | @@ -1436,40 +1453,12 @@ QPDF::readObjectAtOffset( |
| 1436 | 1453 | if (offset == 0) { |
| 1437 | 1454 | QTC::TC("qpdf", "QPDF bogus 0 offset", 0); |
| 1438 | 1455 | warn(damagedPDF(-1, "object has offset 0")); |
| 1439 | - return QPDFObjectHandle::newNull(); | |
| 1456 | + return; | |
| 1440 | 1457 | } |
| 1441 | 1458 | |
| 1442 | - m->file->seek(offset, SEEK_SET); | |
| 1443 | 1459 | try { |
| 1444 | - QPDFTokenizer::Token tobjid = readToken(*m->file); | |
| 1445 | - bool objidok = tobjid.isInteger(); | |
| 1446 | - QTC::TC("qpdf", "QPDF check objid", objidok ? 1 : 0); | |
| 1447 | - if (!objidok) { | |
| 1448 | - QTC::TC("qpdf", "QPDF expected n n obj"); | |
| 1449 | - throw damagedPDF(offset, "expected n n obj"); | |
| 1450 | - } | |
| 1451 | - QPDFTokenizer::Token tgen = readToken(*m->file); | |
| 1452 | - bool genok = tgen.isInteger(); | |
| 1453 | - QTC::TC("qpdf", "QPDF check generation", genok ? 1 : 0); | |
| 1454 | - if (!genok) { | |
| 1455 | - throw damagedPDF(offset, "expected n n obj"); | |
| 1456 | - } | |
| 1457 | - QPDFTokenizer::Token tobj = readToken(*m->file); | |
| 1458 | - | |
| 1459 | - bool objok = tobj.isWord("obj"); | |
| 1460 | - QTC::TC("qpdf", "QPDF check obj", objok ? 1 : 0); | |
| 1461 | - | |
| 1462 | - if (!objok) { | |
| 1463 | - throw damagedPDF(offset, "expected n n obj"); | |
| 1464 | - } | |
| 1465 | - int objid = QUtil::string_to_int(tobjid.getValue().c_str()); | |
| 1466 | - int generation = QUtil::string_to_int(tgen.getValue().c_str()); | |
| 1467 | - og = QPDFObjGen(objid, generation); | |
| 1468 | - if (objid == 0) { | |
| 1469 | - QTC::TC("qpdf", "QPDF object id 0"); | |
| 1470 | - throw damagedPDF(offset, "object with ID 0"); | |
| 1471 | - } | |
| 1472 | - if (check_og && (exp_og != og)) { | |
| 1460 | + og = read_object_start(offset); | |
| 1461 | + if (exp_og != og) { | |
| 1473 | 1462 | QTC::TC("qpdf", "QPDF err wrong objid/generation"); |
| 1474 | 1463 | QPDFExc e = damagedPDF(offset, "expected " + exp_og.unparse(' ') + " obj"); |
| 1475 | 1464 | if (try_recovery) { |
| ... | ... | @@ -1481,86 +1470,104 @@ QPDF::readObjectAtOffset( |
| 1481 | 1470 | } |
| 1482 | 1471 | } |
| 1483 | 1472 | } catch (QPDFExc& e) { |
| 1484 | - if (try_recovery) { | |
| 1485 | - // Try again after reconstructing xref table | |
| 1486 | - reconstruct_xref(e); | |
| 1487 | - if (m->xref_table.contains(exp_og) && m->xref_table[exp_og].getType() == 1) { | |
| 1488 | - qpdf_offset_t new_offset = m->xref_table[exp_og].getOffset(); | |
| 1489 | - QPDFObjectHandle result = | |
| 1490 | - readObjectAtOffset(false, new_offset, description, exp_og, og, false); | |
| 1491 | - QTC::TC("qpdf", "QPDF recovered in readObjectAtOffset"); | |
| 1492 | - return result; | |
| 1493 | - } else { | |
| 1494 | - QTC::TC("qpdf", "QPDF object gone after xref reconstruction"); | |
| 1495 | - warn(damagedPDF( | |
| 1496 | - "", | |
| 1497 | - -1, | |
| 1498 | - ("object " + exp_og.unparse(' ') + | |
| 1499 | - " not found in file after regenerating cross reference " | |
| 1500 | - "table"))); | |
| 1501 | - return QPDFObjectHandle::newNull(); | |
| 1502 | - } | |
| 1503 | - } else { | |
| 1473 | + if (!try_recovery) { | |
| 1504 | 1474 | throw; |
| 1505 | 1475 | } |
| 1476 | + // Try again after reconstructing xref table | |
| 1477 | + reconstruct_xref(e); | |
| 1478 | + if (m->xref_table.contains(exp_og) && m->xref_table[exp_og].getType() == 1) { | |
| 1479 | + qpdf_offset_t new_offset = m->xref_table[exp_og].getOffset(); | |
| 1480 | + readObjectAtOffset(false, new_offset, description, exp_og); | |
| 1481 | + QTC::TC("qpdf", "QPDF recovered in readObjectAtOffset"); | |
| 1482 | + return; | |
| 1483 | + } | |
| 1484 | + QTC::TC("qpdf", "QPDF object gone after xref reconstruction"); | |
| 1485 | + warn(damagedPDF( | |
| 1486 | + "", | |
| 1487 | + -1, | |
| 1488 | + ("object " + exp_og.unparse(' ') + | |
| 1489 | + " not found in file after regenerating cross reference table"))); | |
| 1490 | + return; | |
| 1506 | 1491 | } |
| 1507 | 1492 | |
| 1508 | 1493 | QPDFObjectHandle oh = readObject(description, og); |
| 1509 | 1494 | |
| 1510 | - if (isUnresolved(og)) { | |
| 1511 | - // Store the object in the cache here so it gets cached whether we first know the offset or | |
| 1512 | - // whether we first know the object ID and generation (in which we case we would get here | |
| 1513 | - // through resolve). | |
| 1495 | + // Determine the end offset of this object before and after white space. We use these | |
| 1496 | + // numbers to validate linearization hint tables. Offsets and lengths of objects may imply | |
| 1497 | + // the end of an object to be anywhere between these values. | |
| 1498 | + qpdf_offset_t end_before_space = m->file->tell(); | |
| 1514 | 1499 | |
| 1515 | - // Determine the end offset of this object before and after white space. We use these | |
| 1516 | - // numbers to validate linearization hint tables. Offsets and lengths of objects may imply | |
| 1517 | - // the end of an object to be anywhere between these values. | |
| 1518 | - qpdf_offset_t end_before_space = m->file->tell(); | |
| 1500 | + // skip over spaces | |
| 1501 | + while (true) { | |
| 1502 | + char ch; | |
| 1503 | + if (!m->file->read(&ch, 1)) { | |
| 1504 | + throw damagedPDF(m->file->tell(), "EOF after endobj"); | |
| 1505 | + } | |
| 1506 | + if (!isspace(static_cast<unsigned char>(ch))) { | |
| 1507 | + m->file->seek(-1, SEEK_CUR); | |
| 1508 | + break; | |
| 1509 | + } | |
| 1510 | + } | |
| 1511 | + updateCache(og, oh.getObj(), end_before_space, m->file->tell()); | |
| 1512 | +} | |
| 1519 | 1513 | |
| 1520 | - // skip over spaces | |
| 1521 | - while (true) { | |
| 1522 | - char ch; | |
| 1523 | - if (m->file->read(&ch, 1)) { | |
| 1524 | - if (!isspace(static_cast<unsigned char>(ch))) { | |
| 1525 | - m->file->seek(-1, SEEK_CUR); | |
| 1526 | - break; | |
| 1527 | - } | |
| 1528 | - } else { | |
| 1529 | - throw damagedPDF(m->file->tell(), "EOF after endobj"); | |
| 1530 | - } | |
| 1514 | +QPDFObjectHandle | |
| 1515 | +QPDF::readObjectAtOffset( | |
| 1516 | + qpdf_offset_t offset, std::string const& description, bool skip_cache_if_in_xref) | |
| 1517 | +{ | |
| 1518 | + auto og = read_object_start(offset); | |
| 1519 | + auto oh = readObject(description, og); | |
| 1520 | + | |
| 1521 | + if (!isUnresolved(og)) { | |
| 1522 | + return oh; | |
| 1523 | + } | |
| 1524 | + | |
| 1525 | + if (skip_cache_if_in_xref && m->xref_table.contains(og)) { | |
| 1526 | + // In the special case of the xref stream and linearization hint tables, the offset comes | |
| 1527 | + // from another source. For the specific case of xref streams, the xref stream is read and | |
| 1528 | + // loaded into the object cache very early in parsing. Ordinarily, when a file is updated by | |
| 1529 | + // appending, items inserted into the xref table in later updates take precedence over | |
| 1530 | + // earlier items. In the special case of reusing the object number previously used as the | |
| 1531 | + // xref stream, we have the following order of events: | |
| 1532 | + // | |
| 1533 | + // * reused object gets loaded into the xref table | |
| 1534 | + // * old object is read here while reading xref streams | |
| 1535 | + // * original xref entry is ignored (since already in xref table) | |
| 1536 | + // | |
| 1537 | + // It is the second step that causes a problem. Even though the xref table is correct in | |
| 1538 | + // this case, the old object is already in the cache and so effectively prevails over the | |
| 1539 | + // reused object. To work around this issue, we have a special case for the xref stream (via | |
| 1540 | + // the skip_cache_if_in_xref): if the object is already in the xref stream, don't cache what | |
| 1541 | + // we read here. | |
| 1542 | + // | |
| 1543 | + // It is likely that the same bug may exist for linearization hint tables, but the existing | |
| 1544 | + // code uses end_before_space and end_after_space from the cache, so fixing that would | |
| 1545 | + // require more significant rework. The chances of a linearization hint stream being reused | |
| 1546 | + // seems smaller because the xref stream is probably the highest object in the file and the | |
| 1547 | + // linearization hint stream would be some random place in the middle, so I'm leaving that | |
| 1548 | + // bug unfixed for now. If the bug were to be fixed, we could use !check_og in place of | |
| 1549 | + // skip_cache_if_in_xref. | |
| 1550 | + QTC::TC("qpdf", "QPDF skipping cache for known unchecked object"); | |
| 1551 | + return oh; | |
| 1552 | + } | |
| 1553 | + | |
| 1554 | + // Determine the end offset of this object before and after white space. We use these | |
| 1555 | + // numbers to validate linearization hint tables. Offsets and lengths of objects may imply | |
| 1556 | + // the end of an object to be anywhere between these values. | |
| 1557 | + qpdf_offset_t end_before_space = m->file->tell(); | |
| 1558 | + | |
| 1559 | + // skip over spaces | |
| 1560 | + while (true) { | |
| 1561 | + char ch; | |
| 1562 | + if (!m->file->read(&ch, 1)) { | |
| 1563 | + throw damagedPDF(m->file->tell(), "EOF after endobj"); | |
| 1531 | 1564 | } |
| 1532 | - qpdf_offset_t end_after_space = m->file->tell(); | |
| 1533 | - if (skip_cache_if_in_xref && m->xref_table.contains(og)) { | |
| 1534 | - // Ordinarily, an object gets read here when resolved through xref table or stream. In | |
| 1535 | - // the special case of the xref stream and linearization hint tables, the offset comes | |
| 1536 | - // from another source. For the specific case of xref streams, the xref stream is read | |
| 1537 | - // and loaded into the object cache very early in parsing. Ordinarily, when a file is | |
| 1538 | - // updated by appending, items inserted into the xref table in later updates take | |
| 1539 | - // precedence over earlier items. In the special case of reusing the object number | |
| 1540 | - // previously used as the xref stream, we have the following order of events: | |
| 1541 | - // | |
| 1542 | - // * reused object gets loaded into the xref table | |
| 1543 | - // * old object is read here while reading xref streams | |
| 1544 | - // * original xref entry is ignored (since already in xref table) | |
| 1545 | - // | |
| 1546 | - // It is the second step that causes a problem. Even though the xref table is correct in | |
| 1547 | - // this case, the old object is already in the cache and so effectively prevails over | |
| 1548 | - // the reused object. To work around this issue, we have a special case for the xref | |
| 1549 | - // stream (via the skip_cache_if_in_xref): if the object is already in the xref stream, | |
| 1550 | - // don't cache what we read here. | |
| 1551 | - // | |
| 1552 | - // It is likely that the same bug may exist for linearization hint tables, but the | |
| 1553 | - // existing code uses end_before_space and end_after_space from the cache, so fixing | |
| 1554 | - // that would require more significant rework. The chances of a linearization hint | |
| 1555 | - // stream being reused seems smaller because the xref stream is probably the highest | |
| 1556 | - // object in the file and the linearization hint stream would be some random place in | |
| 1557 | - // the middle, so I'm leaving that bug unfixed for now. If the bug were to be fixed, we | |
| 1558 | - // could use !check_og in place of skip_cache_if_in_xref. | |
| 1559 | - QTC::TC("qpdf", "QPDF skipping cache for known unchecked object"); | |
| 1560 | - } else { | |
| 1561 | - updateCache(og, oh.getObj(), end_before_space, end_after_space); | |
| 1565 | + if (!isspace(static_cast<unsigned char>(ch))) { | |
| 1566 | + m->file->seek(-1, SEEK_CUR); | |
| 1567 | + break; | |
| 1562 | 1568 | } |
| 1563 | 1569 | } |
| 1570 | + updateCache(og, oh.getObj(), end_before_space, m->file->tell()); | |
| 1564 | 1571 | |
| 1565 | 1572 | return oh; |
| 1566 | 1573 | } |
| ... | ... | @@ -1587,12 +1594,8 @@ QPDF::resolve(QPDFObjGen og) |
| 1587 | 1594 | try { |
| 1588 | 1595 | switch (entry.getType()) { |
| 1589 | 1596 | case 1: |
| 1590 | - { | |
| 1591 | - qpdf_offset_t offset = entry.getOffset(); | |
| 1592 | - // Object stored in cache by readObjectAtOffset | |
| 1593 | - QPDFObjGen a_og; | |
| 1594 | - QPDFObjectHandle oh = readObjectAtOffset(true, offset, "", og, a_og, false); | |
| 1595 | - } | |
| 1597 | + // Object stored in cache by readObjectAtOffset | |
| 1598 | + readObjectAtOffset(true, entry.getOffset(), "", og); | |
| 1596 | 1599 | break; |
| 1597 | 1600 | |
| 1598 | 1601 | case 2: | ... | ... |