Commit 1d3fa9124503ed4004851b5bd4033bc1f00268ed
1 parent
c6fbeb87
Refactor object reading logic in `QPDF` for clarity and improved maintainability
Simplify `readObjectAtOffset` by splitting responsibilities into separate functions: `read_object_start` and a streamlined `readObjectAtOffset`. Remove redundant parameters and improve code readability, ensuring better modularity for object reading operations.
Showing
3 changed files
with
137 additions
and
135 deletions
include/qpdf/QPDF.hh
| @@ -795,13 +795,14 @@ class QPDF | @@ -795,13 +795,14 @@ class QPDF | ||
| 795 | std::shared_ptr<InputSource> input, QPDFObjGen og, qpdf_offset_t stream_offset); | 795 | std::shared_ptr<InputSource> input, QPDFObjGen og, qpdf_offset_t stream_offset); |
| 796 | QPDFTokenizer::Token readToken(InputSource&, size_t max_len = 0); | 796 | QPDFTokenizer::Token readToken(InputSource&, size_t max_len = 0); |
| 797 | 797 | ||
| 798 | - QPDFObjectHandle readObjectAtOffset( | 798 | + QPDFObjGen read_object_start(qpdf_offset_t offset); |
| 799 | + void readObjectAtOffset( | ||
| 799 | bool attempt_recovery, | 800 | bool attempt_recovery, |
| 800 | qpdf_offset_t offset, | 801 | qpdf_offset_t offset, |
| 801 | std::string const& description, | 802 | std::string const& description, |
| 802 | - QPDFObjGen exp_og, | ||
| 803 | - QPDFObjGen& og, | ||
| 804 | - bool skip_cache_if_in_xref); | 803 | + QPDFObjGen exp_og); |
| 804 | + QPDFObjectHandle readObjectAtOffset( | ||
| 805 | + qpdf_offset_t offset, std::string const& description, bool skip_cache_if_in_xref); | ||
| 805 | std::shared_ptr<QPDFObject> const& resolve(QPDFObjGen og); | 806 | std::shared_ptr<QPDFObject> const& resolve(QPDFObjGen og); |
| 806 | void resolveObjectsInStream(int obj_stream_number); | 807 | void resolveObjectsInStream(int obj_stream_number); |
| 807 | void stopOnError(std::string const& message); | 808 | void stopOnError(std::string const& message); |
libqpdf/QPDF_linearization.cc
| @@ -274,10 +274,8 @@ QPDF::readLinearizationData() | @@ -274,10 +274,8 @@ QPDF::readLinearizationData() | ||
| 274 | QPDFObjectHandle | 274 | QPDFObjectHandle |
| 275 | QPDF::readHintStream(Pipeline& pl, qpdf_offset_t offset, size_t length) | 275 | QPDF::readHintStream(Pipeline& pl, qpdf_offset_t offset, size_t length) |
| 276 | { | 276 | { |
| 277 | - QPDFObjGen og; | ||
| 278 | - QPDFObjectHandle H = | ||
| 279 | - readObjectAtOffset(false, offset, "linearization hint stream", QPDFObjGen(0, 0), og, false); | ||
| 280 | - ObjCache& oc = m->obj_cache[og]; | 277 | + auto H = readObjectAtOffset(offset, "linearization hint stream", false); |
| 278 | + ObjCache& oc = m->obj_cache[H]; | ||
| 281 | qpdf_offset_t min_end_offset = oc.end_before_space; | 279 | qpdf_offset_t min_end_offset = oc.end_before_space; |
| 282 | qpdf_offset_t max_end_offset = oc.end_after_space; | 280 | qpdf_offset_t max_end_offset = oc.end_after_space; |
| 283 | if (!H.isStream()) { | 281 | if (!H.isStream()) { |
libqpdf/QPDF_objects.cc
| @@ -763,12 +763,10 @@ qpdf_offset_t | @@ -763,12 +763,10 @@ qpdf_offset_t | ||
| 763 | QPDF::read_xrefStream(qpdf_offset_t xref_offset, bool in_stream_recovery) | 763 | QPDF::read_xrefStream(qpdf_offset_t xref_offset, bool in_stream_recovery) |
| 764 | { | 764 | { |
| 765 | if (!m->ignore_xref_streams) { | 765 | if (!m->ignore_xref_streams) { |
| 766 | - QPDFObjGen x_og; | ||
| 767 | QPDFObjectHandle xref_obj; | 766 | QPDFObjectHandle xref_obj; |
| 768 | try { | 767 | try { |
| 769 | m->in_read_xref_stream = true; | 768 | m->in_read_xref_stream = true; |
| 770 | - xref_obj = | ||
| 771 | - readObjectAtOffset(false, xref_offset, "xref stream", QPDFObjGen(0, 0), x_og, true); | 769 | + xref_obj = readObjectAtOffset(xref_offset, "xref stream", true); |
| 772 | } catch (QPDFExc&) { | 770 | } catch (QPDFExc&) { |
| 773 | // ignore -- report error below | 771 | // ignore -- report error below |
| 774 | } | 772 | } |
| @@ -1403,26 +1401,45 @@ QPDF::readToken(InputSource& input, size_t max_len) | @@ -1403,26 +1401,45 @@ QPDF::readToken(InputSource& input, size_t max_len) | ||
| 1403 | return m->tokenizer.readToken(input, m->last_object_description, true, max_len); | 1401 | return m->tokenizer.readToken(input, m->last_object_description, true, max_len); |
| 1404 | } | 1402 | } |
| 1405 | 1403 | ||
| 1406 | -QPDFObjectHandle | 1404 | +QPDFObjGen |
| 1405 | +QPDF::read_object_start(qpdf_offset_t offset) | ||
| 1406 | +{ | ||
| 1407 | + m->file->seek(offset, SEEK_SET); | ||
| 1408 | + QPDFTokenizer::Token tobjid = readToken(*m->file); | ||
| 1409 | + bool objidok = tobjid.isInteger(); | ||
| 1410 | + QTC::TC("qpdf", "QPDF check objid", objidok ? 1 : 0); | ||
| 1411 | + if (!objidok) { | ||
| 1412 | + QTC::TC("qpdf", "QPDF expected n n obj"); | ||
| 1413 | + throw damagedPDF(offset, "expected n n obj"); | ||
| 1414 | + } | ||
| 1415 | + QPDFTokenizer::Token tgen = readToken(*m->file); | ||
| 1416 | + bool genok = tgen.isInteger(); | ||
| 1417 | + QTC::TC("qpdf", "QPDF check generation", genok ? 1 : 0); | ||
| 1418 | + if (!genok) { | ||
| 1419 | + throw damagedPDF(offset, "expected n n obj"); | ||
| 1420 | + } | ||
| 1421 | + QPDFTokenizer::Token tobj = readToken(*m->file); | ||
| 1422 | + | ||
| 1423 | + bool objok = tobj.isWord("obj"); | ||
| 1424 | + QTC::TC("qpdf", "QPDF check obj", objok ? 1 : 0); | ||
| 1425 | + | ||
| 1426 | + if (!objok) { | ||
| 1427 | + throw damagedPDF(offset, "expected n n obj"); | ||
| 1428 | + } | ||
| 1429 | + int objid = QUtil::string_to_int(tobjid.getValue().c_str()); | ||
| 1430 | + int generation = QUtil::string_to_int(tgen.getValue().c_str()); | ||
| 1431 | + if (objid == 0) { | ||
| 1432 | + QTC::TC("qpdf", "QPDF object id 0"); | ||
| 1433 | + throw damagedPDF(offset, "object with ID 0"); | ||
| 1434 | + } | ||
| 1435 | + return {objid, generation}; | ||
| 1436 | +} | ||
| 1437 | + | ||
| 1438 | +void | ||
| 1407 | QPDF::readObjectAtOffset( | 1439 | QPDF::readObjectAtOffset( |
| 1408 | - bool try_recovery, | ||
| 1409 | - qpdf_offset_t offset, | ||
| 1410 | - std::string const& description, | ||
| 1411 | - QPDFObjGen exp_og, | ||
| 1412 | - QPDFObjGen& og, | ||
| 1413 | - bool skip_cache_if_in_xref) | 1440 | + bool try_recovery, qpdf_offset_t offset, std::string const& description, QPDFObjGen exp_og) |
| 1414 | { | 1441 | { |
| 1415 | - bool check_og = true; | ||
| 1416 | - if (exp_og.getObj() == 0) { | ||
| 1417 | - // This method uses an expect object ID of 0 to indicate that we don't know or don't care | ||
| 1418 | - // what the actual object ID is at this offset. This is true when we read the xref stream | ||
| 1419 | - // and linearization hint streams. In this case, we don't verify the expect object | ||
| 1420 | - // ID/generation against what was read from the file. There is also no reason to attempt | ||
| 1421 | - // xref recovery if we get a failure in this case since the read attempt was not triggered | ||
| 1422 | - // by an xref lookup. | ||
| 1423 | - check_og = false; | ||
| 1424 | - try_recovery = false; | ||
| 1425 | - } | 1442 | + QPDFObjGen og; |
| 1426 | setLastObjectDescription(description, exp_og); | 1443 | setLastObjectDescription(description, exp_og); |
| 1427 | 1444 | ||
| 1428 | if (!m->attempt_recovery) { | 1445 | if (!m->attempt_recovery) { |
| @@ -1436,40 +1453,12 @@ QPDF::readObjectAtOffset( | @@ -1436,40 +1453,12 @@ QPDF::readObjectAtOffset( | ||
| 1436 | if (offset == 0) { | 1453 | if (offset == 0) { |
| 1437 | QTC::TC("qpdf", "QPDF bogus 0 offset", 0); | 1454 | QTC::TC("qpdf", "QPDF bogus 0 offset", 0); |
| 1438 | warn(damagedPDF(-1, "object has offset 0")); | 1455 | warn(damagedPDF(-1, "object has offset 0")); |
| 1439 | - return QPDFObjectHandle::newNull(); | 1456 | + return; |
| 1440 | } | 1457 | } |
| 1441 | 1458 | ||
| 1442 | - m->file->seek(offset, SEEK_SET); | ||
| 1443 | try { | 1459 | try { |
| 1444 | - QPDFTokenizer::Token tobjid = readToken(*m->file); | ||
| 1445 | - bool objidok = tobjid.isInteger(); | ||
| 1446 | - QTC::TC("qpdf", "QPDF check objid", objidok ? 1 : 0); | ||
| 1447 | - if (!objidok) { | ||
| 1448 | - QTC::TC("qpdf", "QPDF expected n n obj"); | ||
| 1449 | - throw damagedPDF(offset, "expected n n obj"); | ||
| 1450 | - } | ||
| 1451 | - QPDFTokenizer::Token tgen = readToken(*m->file); | ||
| 1452 | - bool genok = tgen.isInteger(); | ||
| 1453 | - QTC::TC("qpdf", "QPDF check generation", genok ? 1 : 0); | ||
| 1454 | - if (!genok) { | ||
| 1455 | - throw damagedPDF(offset, "expected n n obj"); | ||
| 1456 | - } | ||
| 1457 | - QPDFTokenizer::Token tobj = readToken(*m->file); | ||
| 1458 | - | ||
| 1459 | - bool objok = tobj.isWord("obj"); | ||
| 1460 | - QTC::TC("qpdf", "QPDF check obj", objok ? 1 : 0); | ||
| 1461 | - | ||
| 1462 | - if (!objok) { | ||
| 1463 | - throw damagedPDF(offset, "expected n n obj"); | ||
| 1464 | - } | ||
| 1465 | - int objid = QUtil::string_to_int(tobjid.getValue().c_str()); | ||
| 1466 | - int generation = QUtil::string_to_int(tgen.getValue().c_str()); | ||
| 1467 | - og = QPDFObjGen(objid, generation); | ||
| 1468 | - if (objid == 0) { | ||
| 1469 | - QTC::TC("qpdf", "QPDF object id 0"); | ||
| 1470 | - throw damagedPDF(offset, "object with ID 0"); | ||
| 1471 | - } | ||
| 1472 | - if (check_og && (exp_og != og)) { | 1460 | + og = read_object_start(offset); |
| 1461 | + if (exp_og != og) { | ||
| 1473 | QTC::TC("qpdf", "QPDF err wrong objid/generation"); | 1462 | QTC::TC("qpdf", "QPDF err wrong objid/generation"); |
| 1474 | QPDFExc e = damagedPDF(offset, "expected " + exp_og.unparse(' ') + " obj"); | 1463 | QPDFExc e = damagedPDF(offset, "expected " + exp_og.unparse(' ') + " obj"); |
| 1475 | if (try_recovery) { | 1464 | if (try_recovery) { |
| @@ -1481,86 +1470,104 @@ QPDF::readObjectAtOffset( | @@ -1481,86 +1470,104 @@ QPDF::readObjectAtOffset( | ||
| 1481 | } | 1470 | } |
| 1482 | } | 1471 | } |
| 1483 | } catch (QPDFExc& e) { | 1472 | } catch (QPDFExc& e) { |
| 1484 | - if (try_recovery) { | ||
| 1485 | - // Try again after reconstructing xref table | ||
| 1486 | - reconstruct_xref(e); | ||
| 1487 | - if (m->xref_table.contains(exp_og) && m->xref_table[exp_og].getType() == 1) { | ||
| 1488 | - qpdf_offset_t new_offset = m->xref_table[exp_og].getOffset(); | ||
| 1489 | - QPDFObjectHandle result = | ||
| 1490 | - readObjectAtOffset(false, new_offset, description, exp_og, og, false); | ||
| 1491 | - QTC::TC("qpdf", "QPDF recovered in readObjectAtOffset"); | ||
| 1492 | - return result; | ||
| 1493 | - } else { | ||
| 1494 | - QTC::TC("qpdf", "QPDF object gone after xref reconstruction"); | ||
| 1495 | - warn(damagedPDF( | ||
| 1496 | - "", | ||
| 1497 | - -1, | ||
| 1498 | - ("object " + exp_og.unparse(' ') + | ||
| 1499 | - " not found in file after regenerating cross reference " | ||
| 1500 | - "table"))); | ||
| 1501 | - return QPDFObjectHandle::newNull(); | ||
| 1502 | - } | ||
| 1503 | - } else { | 1473 | + if (!try_recovery) { |
| 1504 | throw; | 1474 | throw; |
| 1505 | } | 1475 | } |
| 1476 | + // Try again after reconstructing xref table | ||
| 1477 | + reconstruct_xref(e); | ||
| 1478 | + if (m->xref_table.contains(exp_og) && m->xref_table[exp_og].getType() == 1) { | ||
| 1479 | + qpdf_offset_t new_offset = m->xref_table[exp_og].getOffset(); | ||
| 1480 | + readObjectAtOffset(false, new_offset, description, exp_og); | ||
| 1481 | + QTC::TC("qpdf", "QPDF recovered in readObjectAtOffset"); | ||
| 1482 | + return; | ||
| 1483 | + } | ||
| 1484 | + QTC::TC("qpdf", "QPDF object gone after xref reconstruction"); | ||
| 1485 | + warn(damagedPDF( | ||
| 1486 | + "", | ||
| 1487 | + -1, | ||
| 1488 | + ("object " + exp_og.unparse(' ') + | ||
| 1489 | + " not found in file after regenerating cross reference table"))); | ||
| 1490 | + return; | ||
| 1506 | } | 1491 | } |
| 1507 | 1492 | ||
| 1508 | QPDFObjectHandle oh = readObject(description, og); | 1493 | QPDFObjectHandle oh = readObject(description, og); |
| 1509 | 1494 | ||
| 1510 | - if (isUnresolved(og)) { | ||
| 1511 | - // Store the object in the cache here so it gets cached whether we first know the offset or | ||
| 1512 | - // whether we first know the object ID and generation (in which we case we would get here | ||
| 1513 | - // through resolve). | 1495 | + // Determine the end offset of this object before and after white space. We use these |
| 1496 | + // numbers to validate linearization hint tables. Offsets and lengths of objects may imply | ||
| 1497 | + // the end of an object to be anywhere between these values. | ||
| 1498 | + qpdf_offset_t end_before_space = m->file->tell(); | ||
| 1514 | 1499 | ||
| 1515 | - // Determine the end offset of this object before and after white space. We use these | ||
| 1516 | - // numbers to validate linearization hint tables. Offsets and lengths of objects may imply | ||
| 1517 | - // the end of an object to be anywhere between these values. | ||
| 1518 | - qpdf_offset_t end_before_space = m->file->tell(); | 1500 | + // skip over spaces |
| 1501 | + while (true) { | ||
| 1502 | + char ch; | ||
| 1503 | + if (!m->file->read(&ch, 1)) { | ||
| 1504 | + throw damagedPDF(m->file->tell(), "EOF after endobj"); | ||
| 1505 | + } | ||
| 1506 | + if (!isspace(static_cast<unsigned char>(ch))) { | ||
| 1507 | + m->file->seek(-1, SEEK_CUR); | ||
| 1508 | + break; | ||
| 1509 | + } | ||
| 1510 | + } | ||
| 1511 | + updateCache(og, oh.getObj(), end_before_space, m->file->tell()); | ||
| 1512 | +} | ||
| 1519 | 1513 | ||
| 1520 | - // skip over spaces | ||
| 1521 | - while (true) { | ||
| 1522 | - char ch; | ||
| 1523 | - if (m->file->read(&ch, 1)) { | ||
| 1524 | - if (!isspace(static_cast<unsigned char>(ch))) { | ||
| 1525 | - m->file->seek(-1, SEEK_CUR); | ||
| 1526 | - break; | ||
| 1527 | - } | ||
| 1528 | - } else { | ||
| 1529 | - throw damagedPDF(m->file->tell(), "EOF after endobj"); | ||
| 1530 | - } | 1514 | +QPDFObjectHandle |
| 1515 | +QPDF::readObjectAtOffset( | ||
| 1516 | + qpdf_offset_t offset, std::string const& description, bool skip_cache_if_in_xref) | ||
| 1517 | +{ | ||
| 1518 | + auto og = read_object_start(offset); | ||
| 1519 | + auto oh = readObject(description, og); | ||
| 1520 | + | ||
| 1521 | + if (!isUnresolved(og)) { | ||
| 1522 | + return oh; | ||
| 1523 | + } | ||
| 1524 | + | ||
| 1525 | + if (skip_cache_if_in_xref && m->xref_table.contains(og)) { | ||
| 1526 | + // In the special case of the xref stream and linearization hint tables, the offset comes | ||
| 1527 | + // from another source. For the specific case of xref streams, the xref stream is read and | ||
| 1528 | + // loaded into the object cache very early in parsing. Ordinarily, when a file is updated by | ||
| 1529 | + // appending, items inserted into the xref table in later updates take precedence over | ||
| 1530 | + // earlier items. In the special case of reusing the object number previously used as the | ||
| 1531 | + // xref stream, we have the following order of events: | ||
| 1532 | + // | ||
| 1533 | + // * reused object gets loaded into the xref table | ||
| 1534 | + // * old object is read here while reading xref streams | ||
| 1535 | + // * original xref entry is ignored (since already in xref table) | ||
| 1536 | + // | ||
| 1537 | + // It is the second step that causes a problem. Even though the xref table is correct in | ||
| 1538 | + // this case, the old object is already in the cache and so effectively prevails over the | ||
| 1539 | + // reused object. To work around this issue, we have a special case for the xref stream (via | ||
| 1540 | + // the skip_cache_if_in_xref): if the object is already in the xref stream, don't cache what | ||
| 1541 | + // we read here. | ||
| 1542 | + // | ||
| 1543 | + // It is likely that the same bug may exist for linearization hint tables, but the existing | ||
| 1544 | + // code uses end_before_space and end_after_space from the cache, so fixing that would | ||
| 1545 | + // require more significant rework. The chances of a linearization hint stream being reused | ||
| 1546 | + // seems smaller because the xref stream is probably the highest object in the file and the | ||
| 1547 | + // linearization hint stream would be some random place in the middle, so I'm leaving that | ||
| 1548 | + // bug unfixed for now. If the bug were to be fixed, we could use !check_og in place of | ||
| 1549 | + // skip_cache_if_in_xref. | ||
| 1550 | + QTC::TC("qpdf", "QPDF skipping cache for known unchecked object"); | ||
| 1551 | + return oh; | ||
| 1552 | + } | ||
| 1553 | + | ||
| 1554 | + // Determine the end offset of this object before and after white space. We use these | ||
| 1555 | + // numbers to validate linearization hint tables. Offsets and lengths of objects may imply | ||
| 1556 | + // the end of an object to be anywhere between these values. | ||
| 1557 | + qpdf_offset_t end_before_space = m->file->tell(); | ||
| 1558 | + | ||
| 1559 | + // skip over spaces | ||
| 1560 | + while (true) { | ||
| 1561 | + char ch; | ||
| 1562 | + if (!m->file->read(&ch, 1)) { | ||
| 1563 | + throw damagedPDF(m->file->tell(), "EOF after endobj"); | ||
| 1531 | } | 1564 | } |
| 1532 | - qpdf_offset_t end_after_space = m->file->tell(); | ||
| 1533 | - if (skip_cache_if_in_xref && m->xref_table.contains(og)) { | ||
| 1534 | - // Ordinarily, an object gets read here when resolved through xref table or stream. In | ||
| 1535 | - // the special case of the xref stream and linearization hint tables, the offset comes | ||
| 1536 | - // from another source. For the specific case of xref streams, the xref stream is read | ||
| 1537 | - // and loaded into the object cache very early in parsing. Ordinarily, when a file is | ||
| 1538 | - // updated by appending, items inserted into the xref table in later updates take | ||
| 1539 | - // precedence over earlier items. In the special case of reusing the object number | ||
| 1540 | - // previously used as the xref stream, we have the following order of events: | ||
| 1541 | - // | ||
| 1542 | - // * reused object gets loaded into the xref table | ||
| 1543 | - // * old object is read here while reading xref streams | ||
| 1544 | - // * original xref entry is ignored (since already in xref table) | ||
| 1545 | - // | ||
| 1546 | - // It is the second step that causes a problem. Even though the xref table is correct in | ||
| 1547 | - // this case, the old object is already in the cache and so effectively prevails over | ||
| 1548 | - // the reused object. To work around this issue, we have a special case for the xref | ||
| 1549 | - // stream (via the skip_cache_if_in_xref): if the object is already in the xref stream, | ||
| 1550 | - // don't cache what we read here. | ||
| 1551 | - // | ||
| 1552 | - // It is likely that the same bug may exist for linearization hint tables, but the | ||
| 1553 | - // existing code uses end_before_space and end_after_space from the cache, so fixing | ||
| 1554 | - // that would require more significant rework. The chances of a linearization hint | ||
| 1555 | - // stream being reused seems smaller because the xref stream is probably the highest | ||
| 1556 | - // object in the file and the linearization hint stream would be some random place in | ||
| 1557 | - // the middle, so I'm leaving that bug unfixed for now. If the bug were to be fixed, we | ||
| 1558 | - // could use !check_og in place of skip_cache_if_in_xref. | ||
| 1559 | - QTC::TC("qpdf", "QPDF skipping cache for known unchecked object"); | ||
| 1560 | - } else { | ||
| 1561 | - updateCache(og, oh.getObj(), end_before_space, end_after_space); | 1565 | + if (!isspace(static_cast<unsigned char>(ch))) { |
| 1566 | + m->file->seek(-1, SEEK_CUR); | ||
| 1567 | + break; | ||
| 1562 | } | 1568 | } |
| 1563 | } | 1569 | } |
| 1570 | + updateCache(og, oh.getObj(), end_before_space, m->file->tell()); | ||
| 1564 | 1571 | ||
| 1565 | return oh; | 1572 | return oh; |
| 1566 | } | 1573 | } |
| @@ -1587,12 +1594,8 @@ QPDF::resolve(QPDFObjGen og) | @@ -1587,12 +1594,8 @@ QPDF::resolve(QPDFObjGen og) | ||
| 1587 | try { | 1594 | try { |
| 1588 | switch (entry.getType()) { | 1595 | switch (entry.getType()) { |
| 1589 | case 1: | 1596 | case 1: |
| 1590 | - { | ||
| 1591 | - qpdf_offset_t offset = entry.getOffset(); | ||
| 1592 | - // Object stored in cache by readObjectAtOffset | ||
| 1593 | - QPDFObjGen a_og; | ||
| 1594 | - QPDFObjectHandle oh = readObjectAtOffset(true, offset, "", og, a_og, false); | ||
| 1595 | - } | 1597 | + // Object stored in cache by readObjectAtOffset |
| 1598 | + readObjectAtOffset(true, entry.getOffset(), "", og); | ||
| 1596 | break; | 1599 | break; |
| 1597 | 1600 | ||
| 1598 | case 2: | 1601 | case 2: |