Commit 1d3fa9124503ed4004851b5bd4033bc1f00268ed

Authored by m-holger
1 parent c6fbeb87

Refactor object reading logic in `QPDF` for clarity and improved maintainability

Simplify `readObjectAtOffset` by splitting responsibilities into separate functions: `read_object_start` and a streamlined `readObjectAtOffset`. Remove redundant parameters and improve code readability, ensuring better modularity for object reading operations.
include/qpdf/QPDF.hh
... ... @@ -795,13 +795,14 @@ class QPDF
795 795 std::shared_ptr<InputSource> input, QPDFObjGen og, qpdf_offset_t stream_offset);
796 796 QPDFTokenizer::Token readToken(InputSource&, size_t max_len = 0);
797 797  
798   - QPDFObjectHandle readObjectAtOffset(
  798 + QPDFObjGen read_object_start(qpdf_offset_t offset);
  799 + void readObjectAtOffset(
799 800 bool attempt_recovery,
800 801 qpdf_offset_t offset,
801 802 std::string const& description,
802   - QPDFObjGen exp_og,
803   - QPDFObjGen& og,
804   - bool skip_cache_if_in_xref);
  803 + QPDFObjGen exp_og);
  804 + QPDFObjectHandle readObjectAtOffset(
  805 + qpdf_offset_t offset, std::string const& description, bool skip_cache_if_in_xref);
805 806 std::shared_ptr<QPDFObject> const& resolve(QPDFObjGen og);
806 807 void resolveObjectsInStream(int obj_stream_number);
807 808 void stopOnError(std::string const& message);
... ...
libqpdf/QPDF_linearization.cc
... ... @@ -274,10 +274,8 @@ QPDF::readLinearizationData()
274 274 QPDFObjectHandle
275 275 QPDF::readHintStream(Pipeline& pl, qpdf_offset_t offset, size_t length)
276 276 {
277   - QPDFObjGen og;
278   - QPDFObjectHandle H =
279   - readObjectAtOffset(false, offset, "linearization hint stream", QPDFObjGen(0, 0), og, false);
280   - ObjCache& oc = m->obj_cache[og];
  277 + auto H = readObjectAtOffset(offset, "linearization hint stream", false);
  278 + ObjCache& oc = m->obj_cache[H];
281 279 qpdf_offset_t min_end_offset = oc.end_before_space;
282 280 qpdf_offset_t max_end_offset = oc.end_after_space;
283 281 if (!H.isStream()) {
... ...
libqpdf/QPDF_objects.cc
... ... @@ -763,12 +763,10 @@ qpdf_offset_t
763 763 QPDF::read_xrefStream(qpdf_offset_t xref_offset, bool in_stream_recovery)
764 764 {
765 765 if (!m->ignore_xref_streams) {
766   - QPDFObjGen x_og;
767 766 QPDFObjectHandle xref_obj;
768 767 try {
769 768 m->in_read_xref_stream = true;
770   - xref_obj =
771   - readObjectAtOffset(false, xref_offset, "xref stream", QPDFObjGen(0, 0), x_og, true);
  769 + xref_obj = readObjectAtOffset(xref_offset, "xref stream", true);
772 770 } catch (QPDFExc&) {
773 771 // ignore -- report error below
774 772 }
... ... @@ -1403,26 +1401,45 @@ QPDF::readToken(InputSource&amp; input, size_t max_len)
1403 1401 return m->tokenizer.readToken(input, m->last_object_description, true, max_len);
1404 1402 }
1405 1403  
1406   -QPDFObjectHandle
  1404 +QPDFObjGen
  1405 +QPDF::read_object_start(qpdf_offset_t offset)
  1406 +{
  1407 + m->file->seek(offset, SEEK_SET);
  1408 + QPDFTokenizer::Token tobjid = readToken(*m->file);
  1409 + bool objidok = tobjid.isInteger();
  1410 + QTC::TC("qpdf", "QPDF check objid", objidok ? 1 : 0);
  1411 + if (!objidok) {
  1412 + QTC::TC("qpdf", "QPDF expected n n obj");
  1413 + throw damagedPDF(offset, "expected n n obj");
  1414 + }
  1415 + QPDFTokenizer::Token tgen = readToken(*m->file);
  1416 + bool genok = tgen.isInteger();
  1417 + QTC::TC("qpdf", "QPDF check generation", genok ? 1 : 0);
  1418 + if (!genok) {
  1419 + throw damagedPDF(offset, "expected n n obj");
  1420 + }
  1421 + QPDFTokenizer::Token tobj = readToken(*m->file);
  1422 +
  1423 + bool objok = tobj.isWord("obj");
  1424 + QTC::TC("qpdf", "QPDF check obj", objok ? 1 : 0);
  1425 +
  1426 + if (!objok) {
  1427 + throw damagedPDF(offset, "expected n n obj");
  1428 + }
  1429 + int objid = QUtil::string_to_int(tobjid.getValue().c_str());
  1430 + int generation = QUtil::string_to_int(tgen.getValue().c_str());
  1431 + if (objid == 0) {
  1432 + QTC::TC("qpdf", "QPDF object id 0");
  1433 + throw damagedPDF(offset, "object with ID 0");
  1434 + }
  1435 + return {objid, generation};
  1436 +}
  1437 +
  1438 +void
1407 1439 QPDF::readObjectAtOffset(
1408   - bool try_recovery,
1409   - qpdf_offset_t offset,
1410   - std::string const& description,
1411   - QPDFObjGen exp_og,
1412   - QPDFObjGen& og,
1413   - bool skip_cache_if_in_xref)
  1440 + bool try_recovery, qpdf_offset_t offset, std::string const& description, QPDFObjGen exp_og)
1414 1441 {
1415   - bool check_og = true;
1416   - if (exp_og.getObj() == 0) {
1417   - // This method uses an expect object ID of 0 to indicate that we don't know or don't care
1418   - // what the actual object ID is at this offset. This is true when we read the xref stream
1419   - // and linearization hint streams. In this case, we don't verify the expect object
1420   - // ID/generation against what was read from the file. There is also no reason to attempt
1421   - // xref recovery if we get a failure in this case since the read attempt was not triggered
1422   - // by an xref lookup.
1423   - check_og = false;
1424   - try_recovery = false;
1425   - }
  1442 + QPDFObjGen og;
1426 1443 setLastObjectDescription(description, exp_og);
1427 1444  
1428 1445 if (!m->attempt_recovery) {
... ... @@ -1436,40 +1453,12 @@ QPDF::readObjectAtOffset(
1436 1453 if (offset == 0) {
1437 1454 QTC::TC("qpdf", "QPDF bogus 0 offset", 0);
1438 1455 warn(damagedPDF(-1, "object has offset 0"));
1439   - return QPDFObjectHandle::newNull();
  1456 + return;
1440 1457 }
1441 1458  
1442   - m->file->seek(offset, SEEK_SET);
1443 1459 try {
1444   - QPDFTokenizer::Token tobjid = readToken(*m->file);
1445   - bool objidok = tobjid.isInteger();
1446   - QTC::TC("qpdf", "QPDF check objid", objidok ? 1 : 0);
1447   - if (!objidok) {
1448   - QTC::TC("qpdf", "QPDF expected n n obj");
1449   - throw damagedPDF(offset, "expected n n obj");
1450   - }
1451   - QPDFTokenizer::Token tgen = readToken(*m->file);
1452   - bool genok = tgen.isInteger();
1453   - QTC::TC("qpdf", "QPDF check generation", genok ? 1 : 0);
1454   - if (!genok) {
1455   - throw damagedPDF(offset, "expected n n obj");
1456   - }
1457   - QPDFTokenizer::Token tobj = readToken(*m->file);
1458   -
1459   - bool objok = tobj.isWord("obj");
1460   - QTC::TC("qpdf", "QPDF check obj", objok ? 1 : 0);
1461   -
1462   - if (!objok) {
1463   - throw damagedPDF(offset, "expected n n obj");
1464   - }
1465   - int objid = QUtil::string_to_int(tobjid.getValue().c_str());
1466   - int generation = QUtil::string_to_int(tgen.getValue().c_str());
1467   - og = QPDFObjGen(objid, generation);
1468   - if (objid == 0) {
1469   - QTC::TC("qpdf", "QPDF object id 0");
1470   - throw damagedPDF(offset, "object with ID 0");
1471   - }
1472   - if (check_og && (exp_og != og)) {
  1460 + og = read_object_start(offset);
  1461 + if (exp_og != og) {
1473 1462 QTC::TC("qpdf", "QPDF err wrong objid/generation");
1474 1463 QPDFExc e = damagedPDF(offset, "expected " + exp_og.unparse(' ') + " obj");
1475 1464 if (try_recovery) {
... ... @@ -1481,86 +1470,104 @@ QPDF::readObjectAtOffset(
1481 1470 }
1482 1471 }
1483 1472 } catch (QPDFExc& e) {
1484   - if (try_recovery) {
1485   - // Try again after reconstructing xref table
1486   - reconstruct_xref(e);
1487   - if (m->xref_table.contains(exp_og) && m->xref_table[exp_og].getType() == 1) {
1488   - qpdf_offset_t new_offset = m->xref_table[exp_og].getOffset();
1489   - QPDFObjectHandle result =
1490   - readObjectAtOffset(false, new_offset, description, exp_og, og, false);
1491   - QTC::TC("qpdf", "QPDF recovered in readObjectAtOffset");
1492   - return result;
1493   - } else {
1494   - QTC::TC("qpdf", "QPDF object gone after xref reconstruction");
1495   - warn(damagedPDF(
1496   - "",
1497   - -1,
1498   - ("object " + exp_og.unparse(' ') +
1499   - " not found in file after regenerating cross reference "
1500   - "table")));
1501   - return QPDFObjectHandle::newNull();
1502   - }
1503   - } else {
  1473 + if (!try_recovery) {
1504 1474 throw;
1505 1475 }
  1476 + // Try again after reconstructing xref table
  1477 + reconstruct_xref(e);
  1478 + if (m->xref_table.contains(exp_og) && m->xref_table[exp_og].getType() == 1) {
  1479 + qpdf_offset_t new_offset = m->xref_table[exp_og].getOffset();
  1480 + readObjectAtOffset(false, new_offset, description, exp_og);
  1481 + QTC::TC("qpdf", "QPDF recovered in readObjectAtOffset");
  1482 + return;
  1483 + }
  1484 + QTC::TC("qpdf", "QPDF object gone after xref reconstruction");
  1485 + warn(damagedPDF(
  1486 + "",
  1487 + -1,
  1488 + ("object " + exp_og.unparse(' ') +
  1489 + " not found in file after regenerating cross reference table")));
  1490 + return;
1506 1491 }
1507 1492  
1508 1493 QPDFObjectHandle oh = readObject(description, og);
1509 1494  
1510   - if (isUnresolved(og)) {
1511   - // Store the object in the cache here so it gets cached whether we first know the offset or
1512   - // whether we first know the object ID and generation (in which we case we would get here
1513   - // through resolve).
  1495 + // Determine the end offset of this object before and after white space. We use these
  1496 + // numbers to validate linearization hint tables. Offsets and lengths of objects may imply
  1497 + // the end of an object to be anywhere between these values.
  1498 + qpdf_offset_t end_before_space = m->file->tell();
1514 1499  
1515   - // Determine the end offset of this object before and after white space. We use these
1516   - // numbers to validate linearization hint tables. Offsets and lengths of objects may imply
1517   - // the end of an object to be anywhere between these values.
1518   - qpdf_offset_t end_before_space = m->file->tell();
  1500 + // skip over spaces
  1501 + while (true) {
  1502 + char ch;
  1503 + if (!m->file->read(&ch, 1)) {
  1504 + throw damagedPDF(m->file->tell(), "EOF after endobj");
  1505 + }
  1506 + if (!isspace(static_cast<unsigned char>(ch))) {
  1507 + m->file->seek(-1, SEEK_CUR);
  1508 + break;
  1509 + }
  1510 + }
  1511 + updateCache(og, oh.getObj(), end_before_space, m->file->tell());
  1512 +}
1519 1513  
1520   - // skip over spaces
1521   - while (true) {
1522   - char ch;
1523   - if (m->file->read(&ch, 1)) {
1524   - if (!isspace(static_cast<unsigned char>(ch))) {
1525   - m->file->seek(-1, SEEK_CUR);
1526   - break;
1527   - }
1528   - } else {
1529   - throw damagedPDF(m->file->tell(), "EOF after endobj");
1530   - }
  1514 +QPDFObjectHandle
  1515 +QPDF::readObjectAtOffset(
  1516 + qpdf_offset_t offset, std::string const& description, bool skip_cache_if_in_xref)
  1517 +{
  1518 + auto og = read_object_start(offset);
  1519 + auto oh = readObject(description, og);
  1520 +
  1521 + if (!isUnresolved(og)) {
  1522 + return oh;
  1523 + }
  1524 +
  1525 + if (skip_cache_if_in_xref && m->xref_table.contains(og)) {
  1526 + // In the special case of the xref stream and linearization hint tables, the offset comes
  1527 + // from another source. For the specific case of xref streams, the xref stream is read and
  1528 + // loaded into the object cache very early in parsing. Ordinarily, when a file is updated by
  1529 + // appending, items inserted into the xref table in later updates take precedence over
  1530 + // earlier items. In the special case of reusing the object number previously used as the
  1531 + // xref stream, we have the following order of events:
  1532 + //
  1533 + // * reused object gets loaded into the xref table
  1534 + // * old object is read here while reading xref streams
  1535 + // * original xref entry is ignored (since already in xref table)
  1536 + //
  1537 + // It is the second step that causes a problem. Even though the xref table is correct in
  1538 + // this case, the old object is already in the cache and so effectively prevails over the
  1539 + // reused object. To work around this issue, we have a special case for the xref stream (via
  1540 + // the skip_cache_if_in_xref): if the object is already in the xref stream, don't cache what
  1541 + // we read here.
  1542 + //
  1543 + // It is likely that the same bug may exist for linearization hint tables, but the existing
  1544 + // code uses end_before_space and end_after_space from the cache, so fixing that would
  1545 + // require more significant rework. The chances of a linearization hint stream being reused
  1546 + // seems smaller because the xref stream is probably the highest object in the file and the
  1547 + // linearization hint stream would be some random place in the middle, so I'm leaving that
  1548 + // bug unfixed for now. If the bug were to be fixed, we could use !check_og in place of
  1549 + // skip_cache_if_in_xref.
  1550 + QTC::TC("qpdf", "QPDF skipping cache for known unchecked object");
  1551 + return oh;
  1552 + }
  1553 +
  1554 + // Determine the end offset of this object before and after white space. We use these
  1555 + // numbers to validate linearization hint tables. Offsets and lengths of objects may imply
  1556 + // the end of an object to be anywhere between these values.
  1557 + qpdf_offset_t end_before_space = m->file->tell();
  1558 +
  1559 + // skip over spaces
  1560 + while (true) {
  1561 + char ch;
  1562 + if (!m->file->read(&ch, 1)) {
  1563 + throw damagedPDF(m->file->tell(), "EOF after endobj");
1531 1564 }
1532   - qpdf_offset_t end_after_space = m->file->tell();
1533   - if (skip_cache_if_in_xref && m->xref_table.contains(og)) {
1534   - // Ordinarily, an object gets read here when resolved through xref table or stream. In
1535   - // the special case of the xref stream and linearization hint tables, the offset comes
1536   - // from another source. For the specific case of xref streams, the xref stream is read
1537   - // and loaded into the object cache very early in parsing. Ordinarily, when a file is
1538   - // updated by appending, items inserted into the xref table in later updates take
1539   - // precedence over earlier items. In the special case of reusing the object number
1540   - // previously used as the xref stream, we have the following order of events:
1541   - //
1542   - // * reused object gets loaded into the xref table
1543   - // * old object is read here while reading xref streams
1544   - // * original xref entry is ignored (since already in xref table)
1545   - //
1546   - // It is the second step that causes a problem. Even though the xref table is correct in
1547   - // this case, the old object is already in the cache and so effectively prevails over
1548   - // the reused object. To work around this issue, we have a special case for the xref
1549   - // stream (via the skip_cache_if_in_xref): if the object is already in the xref stream,
1550   - // don't cache what we read here.
1551   - //
1552   - // It is likely that the same bug may exist for linearization hint tables, but the
1553   - // existing code uses end_before_space and end_after_space from the cache, so fixing
1554   - // that would require more significant rework. The chances of a linearization hint
1555   - // stream being reused seems smaller because the xref stream is probably the highest
1556   - // object in the file and the linearization hint stream would be some random place in
1557   - // the middle, so I'm leaving that bug unfixed for now. If the bug were to be fixed, we
1558   - // could use !check_og in place of skip_cache_if_in_xref.
1559   - QTC::TC("qpdf", "QPDF skipping cache for known unchecked object");
1560   - } else {
1561   - updateCache(og, oh.getObj(), end_before_space, end_after_space);
  1565 + if (!isspace(static_cast<unsigned char>(ch))) {
  1566 + m->file->seek(-1, SEEK_CUR);
  1567 + break;
1562 1568 }
1563 1569 }
  1570 + updateCache(og, oh.getObj(), end_before_space, m->file->tell());
1564 1571  
1565 1572 return oh;
1566 1573 }
... ... @@ -1587,12 +1594,8 @@ QPDF::resolve(QPDFObjGen og)
1587 1594 try {
1588 1595 switch (entry.getType()) {
1589 1596 case 1:
1590   - {
1591   - qpdf_offset_t offset = entry.getOffset();
1592   - // Object stored in cache by readObjectAtOffset
1593   - QPDFObjGen a_og;
1594   - QPDFObjectHandle oh = readObjectAtOffset(true, offset, "", og, a_og, false);
1595   - }
  1597 + // Object stored in cache by readObjectAtOffset
  1598 + readObjectAtOffset(true, entry.getOffset(), "", og);
1596 1599 break;
1597 1600  
1598 1601 case 2:
... ...