Commit 1d3fa9124503ed4004851b5bd4033bc1f00268ed

Authored by m-holger
1 parent c6fbeb87

Refactor object reading logic in `QPDF` for clarity and improved maintainability

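Simplify `readObjectAtOffset` by splitting its responsibilities into three functions: `read_object_start`, which parses and validates the `n n obj` header at a given offset, and two streamlined `readObjectAtOffset` overloads — one that checks the header against an expected object ID/generation (with optional xref recovery) and stores the result in the object cache, and one that returns the object for callers that do not know the ID in advance (xref streams and linearization hint streams). Remove redundant parameters (the `og` out-parameter and the "expected object ID 0 means don't check" convention) and improve code readability, ensuring better modularity for object reading operations.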
include/qpdf/QPDF.hh
@@ -795,13 +795,14 @@ class QPDF @@ -795,13 +795,14 @@ class QPDF
795 std::shared_ptr<InputSource> input, QPDFObjGen og, qpdf_offset_t stream_offset); 795 std::shared_ptr<InputSource> input, QPDFObjGen og, qpdf_offset_t stream_offset);
796 QPDFTokenizer::Token readToken(InputSource&, size_t max_len = 0); 796 QPDFTokenizer::Token readToken(InputSource&, size_t max_len = 0);
797 797
798 - QPDFObjectHandle readObjectAtOffset( 798 + QPDFObjGen read_object_start(qpdf_offset_t offset);
  799 + void readObjectAtOffset(
799 bool attempt_recovery, 800 bool attempt_recovery,
800 qpdf_offset_t offset, 801 qpdf_offset_t offset,
801 std::string const& description, 802 std::string const& description,
802 - QPDFObjGen exp_og,  
803 - QPDFObjGen& og,  
804 - bool skip_cache_if_in_xref); 803 + QPDFObjGen exp_og);
  804 + QPDFObjectHandle readObjectAtOffset(
  805 + qpdf_offset_t offset, std::string const& description, bool skip_cache_if_in_xref);
805 std::shared_ptr<QPDFObject> const& resolve(QPDFObjGen og); 806 std::shared_ptr<QPDFObject> const& resolve(QPDFObjGen og);
806 void resolveObjectsInStream(int obj_stream_number); 807 void resolveObjectsInStream(int obj_stream_number);
807 void stopOnError(std::string const& message); 808 void stopOnError(std::string const& message);
libqpdf/QPDF_linearization.cc
@@ -274,10 +274,8 @@ QPDF::readLinearizationData() @@ -274,10 +274,8 @@ QPDF::readLinearizationData()
274 QPDFObjectHandle 274 QPDFObjectHandle
275 QPDF::readHintStream(Pipeline& pl, qpdf_offset_t offset, size_t length) 275 QPDF::readHintStream(Pipeline& pl, qpdf_offset_t offset, size_t length)
276 { 276 {
277 - QPDFObjGen og;  
278 - QPDFObjectHandle H =  
279 - readObjectAtOffset(false, offset, "linearization hint stream", QPDFObjGen(0, 0), og, false);  
280 - ObjCache& oc = m->obj_cache[og]; 277 + auto H = readObjectAtOffset(offset, "linearization hint stream", false);
  278 + ObjCache& oc = m->obj_cache[H];
281 qpdf_offset_t min_end_offset = oc.end_before_space; 279 qpdf_offset_t min_end_offset = oc.end_before_space;
282 qpdf_offset_t max_end_offset = oc.end_after_space; 280 qpdf_offset_t max_end_offset = oc.end_after_space;
283 if (!H.isStream()) { 281 if (!H.isStream()) {
libqpdf/QPDF_objects.cc
@@ -763,12 +763,10 @@ qpdf_offset_t @@ -763,12 +763,10 @@ qpdf_offset_t
763 QPDF::read_xrefStream(qpdf_offset_t xref_offset, bool in_stream_recovery) 763 QPDF::read_xrefStream(qpdf_offset_t xref_offset, bool in_stream_recovery)
764 { 764 {
765 if (!m->ignore_xref_streams) { 765 if (!m->ignore_xref_streams) {
766 - QPDFObjGen x_og;  
767 QPDFObjectHandle xref_obj; 766 QPDFObjectHandle xref_obj;
768 try { 767 try {
769 m->in_read_xref_stream = true; 768 m->in_read_xref_stream = true;
770 - xref_obj =  
771 - readObjectAtOffset(false, xref_offset, "xref stream", QPDFObjGen(0, 0), x_og, true); 769 + xref_obj = readObjectAtOffset(xref_offset, "xref stream", true);
772 } catch (QPDFExc&) { 770 } catch (QPDFExc&) {
773 // ignore -- report error below 771 // ignore -- report error below
774 } 772 }
@@ -1403,26 +1401,45 @@ QPDF::readToken(InputSource& input, size_t max_len) @@ -1403,26 +1401,45 @@ QPDF::readToken(InputSource& input, size_t max_len)
1403 return m->tokenizer.readToken(input, m->last_object_description, true, max_len); 1401 return m->tokenizer.readToken(input, m->last_object_description, true, max_len);
1404 } 1402 }
1405 1403
1406 -QPDFObjectHandle 1404 +QPDFObjGen
  1405 +QPDF::read_object_start(qpdf_offset_t offset)
  1406 +{
  1407 + m->file->seek(offset, SEEK_SET);
  1408 + QPDFTokenizer::Token tobjid = readToken(*m->file);
  1409 + bool objidok = tobjid.isInteger();
  1410 + QTC::TC("qpdf", "QPDF check objid", objidok ? 1 : 0);
  1411 + if (!objidok) {
  1412 + QTC::TC("qpdf", "QPDF expected n n obj");
  1413 + throw damagedPDF(offset, "expected n n obj");
  1414 + }
  1415 + QPDFTokenizer::Token tgen = readToken(*m->file);
  1416 + bool genok = tgen.isInteger();
  1417 + QTC::TC("qpdf", "QPDF check generation", genok ? 1 : 0);
  1418 + if (!genok) {
  1419 + throw damagedPDF(offset, "expected n n obj");
  1420 + }
  1421 + QPDFTokenizer::Token tobj = readToken(*m->file);
  1422 +
  1423 + bool objok = tobj.isWord("obj");
  1424 + QTC::TC("qpdf", "QPDF check obj", objok ? 1 : 0);
  1425 +
  1426 + if (!objok) {
  1427 + throw damagedPDF(offset, "expected n n obj");
  1428 + }
  1429 + int objid = QUtil::string_to_int(tobjid.getValue().c_str());
  1430 + int generation = QUtil::string_to_int(tgen.getValue().c_str());
  1431 + if (objid == 0) {
  1432 + QTC::TC("qpdf", "QPDF object id 0");
  1433 + throw damagedPDF(offset, "object with ID 0");
  1434 + }
  1435 + return {objid, generation};
  1436 +}
  1437 +
  1438 +void
1407 QPDF::readObjectAtOffset( 1439 QPDF::readObjectAtOffset(
1408 - bool try_recovery,  
1409 - qpdf_offset_t offset,  
1410 - std::string const& description,  
1411 - QPDFObjGen exp_og,  
1412 - QPDFObjGen& og,  
1413 - bool skip_cache_if_in_xref) 1440 + bool try_recovery, qpdf_offset_t offset, std::string const& description, QPDFObjGen exp_og)
1414 { 1441 {
1415 - bool check_og = true;  
1416 - if (exp_og.getObj() == 0) {  
1417 - // This method uses an expect object ID of 0 to indicate that we don't know or don't care  
1418 - // what the actual object ID is at this offset. This is true when we read the xref stream  
1419 - // and linearization hint streams. In this case, we don't verify the expect object  
1420 - // ID/generation against what was read from the file. There is also no reason to attempt  
1421 - // xref recovery if we get a failure in this case since the read attempt was not triggered  
1422 - // by an xref lookup.  
1423 - check_og = false;  
1424 - try_recovery = false;  
1425 - } 1442 + QPDFObjGen og;
1426 setLastObjectDescription(description, exp_og); 1443 setLastObjectDescription(description, exp_og);
1427 1444
1428 if (!m->attempt_recovery) { 1445 if (!m->attempt_recovery) {
@@ -1436,40 +1453,12 @@ QPDF::readObjectAtOffset( @@ -1436,40 +1453,12 @@ QPDF::readObjectAtOffset(
1436 if (offset == 0) { 1453 if (offset == 0) {
1437 QTC::TC("qpdf", "QPDF bogus 0 offset", 0); 1454 QTC::TC("qpdf", "QPDF bogus 0 offset", 0);
1438 warn(damagedPDF(-1, "object has offset 0")); 1455 warn(damagedPDF(-1, "object has offset 0"));
1439 - return QPDFObjectHandle::newNull(); 1456 + return;
1440 } 1457 }
1441 1458
1442 - m->file->seek(offset, SEEK_SET);  
1443 try { 1459 try {
1444 - QPDFTokenizer::Token tobjid = readToken(*m->file);  
1445 - bool objidok = tobjid.isInteger();  
1446 - QTC::TC("qpdf", "QPDF check objid", objidok ? 1 : 0);  
1447 - if (!objidok) {  
1448 - QTC::TC("qpdf", "QPDF expected n n obj");  
1449 - throw damagedPDF(offset, "expected n n obj");  
1450 - }  
1451 - QPDFTokenizer::Token tgen = readToken(*m->file);  
1452 - bool genok = tgen.isInteger();  
1453 - QTC::TC("qpdf", "QPDF check generation", genok ? 1 : 0);  
1454 - if (!genok) {  
1455 - throw damagedPDF(offset, "expected n n obj");  
1456 - }  
1457 - QPDFTokenizer::Token tobj = readToken(*m->file);  
1458 -  
1459 - bool objok = tobj.isWord("obj");  
1460 - QTC::TC("qpdf", "QPDF check obj", objok ? 1 : 0);  
1461 -  
1462 - if (!objok) {  
1463 - throw damagedPDF(offset, "expected n n obj");  
1464 - }  
1465 - int objid = QUtil::string_to_int(tobjid.getValue().c_str());  
1466 - int generation = QUtil::string_to_int(tgen.getValue().c_str());  
1467 - og = QPDFObjGen(objid, generation);  
1468 - if (objid == 0) {  
1469 - QTC::TC("qpdf", "QPDF object id 0");  
1470 - throw damagedPDF(offset, "object with ID 0");  
1471 - }  
1472 - if (check_og && (exp_og != og)) { 1460 + og = read_object_start(offset);
  1461 + if (exp_og != og) {
1473 QTC::TC("qpdf", "QPDF err wrong objid/generation"); 1462 QTC::TC("qpdf", "QPDF err wrong objid/generation");
1474 QPDFExc e = damagedPDF(offset, "expected " + exp_og.unparse(' ') + " obj"); 1463 QPDFExc e = damagedPDF(offset, "expected " + exp_og.unparse(' ') + " obj");
1475 if (try_recovery) { 1464 if (try_recovery) {
@@ -1481,86 +1470,104 @@ QPDF::readObjectAtOffset( @@ -1481,86 +1470,104 @@ QPDF::readObjectAtOffset(
1481 } 1470 }
1482 } 1471 }
1483 } catch (QPDFExc& e) { 1472 } catch (QPDFExc& e) {
1484 - if (try_recovery) {  
1485 - // Try again after reconstructing xref table  
1486 - reconstruct_xref(e);  
1487 - if (m->xref_table.contains(exp_og) && m->xref_table[exp_og].getType() == 1) {  
1488 - qpdf_offset_t new_offset = m->xref_table[exp_og].getOffset();  
1489 - QPDFObjectHandle result =  
1490 - readObjectAtOffset(false, new_offset, description, exp_og, og, false);  
1491 - QTC::TC("qpdf", "QPDF recovered in readObjectAtOffset");  
1492 - return result;  
1493 - } else {  
1494 - QTC::TC("qpdf", "QPDF object gone after xref reconstruction");  
1495 - warn(damagedPDF(  
1496 - "",  
1497 - -1,  
1498 - ("object " + exp_og.unparse(' ') +  
1499 - " not found in file after regenerating cross reference "  
1500 - "table")));  
1501 - return QPDFObjectHandle::newNull();  
1502 - }  
1503 - } else { 1473 + if (!try_recovery) {
1504 throw; 1474 throw;
1505 } 1475 }
  1476 + // Try again after reconstructing xref table
  1477 + reconstruct_xref(e);
  1478 + if (m->xref_table.contains(exp_og) && m->xref_table[exp_og].getType() == 1) {
  1479 + qpdf_offset_t new_offset = m->xref_table[exp_og].getOffset();
  1480 + readObjectAtOffset(false, new_offset, description, exp_og);
  1481 + QTC::TC("qpdf", "QPDF recovered in readObjectAtOffset");
  1482 + return;
  1483 + }
  1484 + QTC::TC("qpdf", "QPDF object gone after xref reconstruction");
  1485 + warn(damagedPDF(
  1486 + "",
  1487 + -1,
  1488 + ("object " + exp_og.unparse(' ') +
  1489 + " not found in file after regenerating cross reference table")));
  1490 + return;
1506 } 1491 }
1507 1492
1508 QPDFObjectHandle oh = readObject(description, og); 1493 QPDFObjectHandle oh = readObject(description, og);
1509 1494
1510 - if (isUnresolved(og)) {  
1511 - // Store the object in the cache here so it gets cached whether we first know the offset or  
1512 - // whether we first know the object ID and generation (in which we case we would get here  
1513 - // through resolve). 1495 + // Determine the end offset of this object before and after white space. We use these
  1496 + // numbers to validate linearization hint tables. Offsets and lengths of objects may imply
  1497 + // the end of an object to be anywhere between these values.
  1498 + qpdf_offset_t end_before_space = m->file->tell();
1514 1499
1515 - // Determine the end offset of this object before and after white space. We use these  
1516 - // numbers to validate linearization hint tables. Offsets and lengths of objects may imply  
1517 - // the end of an object to be anywhere between these values.  
1518 - qpdf_offset_t end_before_space = m->file->tell(); 1500 + // skip over spaces
  1501 + while (true) {
  1502 + char ch;
  1503 + if (!m->file->read(&ch, 1)) {
  1504 + throw damagedPDF(m->file->tell(), "EOF after endobj");
  1505 + }
  1506 + if (!isspace(static_cast<unsigned char>(ch))) {
  1507 + m->file->seek(-1, SEEK_CUR);
  1508 + break;
  1509 + }
  1510 + }
  1511 + updateCache(og, oh.getObj(), end_before_space, m->file->tell());
  1512 +}
1519 1513
1520 - // skip over spaces  
1521 - while (true) {  
1522 - char ch;  
1523 - if (m->file->read(&ch, 1)) {  
1524 - if (!isspace(static_cast<unsigned char>(ch))) {  
1525 - m->file->seek(-1, SEEK_CUR);  
1526 - break;  
1527 - }  
1528 - } else {  
1529 - throw damagedPDF(m->file->tell(), "EOF after endobj");  
1530 - } 1514 +QPDFObjectHandle
  1515 +QPDF::readObjectAtOffset(
  1516 + qpdf_offset_t offset, std::string const& description, bool skip_cache_if_in_xref)
  1517 +{
  1518 + auto og = read_object_start(offset);
  1519 + auto oh = readObject(description, og);
  1520 +
  1521 + if (!isUnresolved(og)) {
  1522 + return oh;
  1523 + }
  1524 +
  1525 + if (skip_cache_if_in_xref && m->xref_table.contains(og)) {
  1526 + // In the special case of the xref stream and linearization hint tables, the offset comes
  1527 + // from another source. For the specific case of xref streams, the xref stream is read and
  1528 + // loaded into the object cache very early in parsing. Ordinarily, when a file is updated by
  1529 + // appending, items inserted into the xref table in later updates take precedence over
  1530 + // earlier items. In the special case of reusing the object number previously used as the
  1531 + // xref stream, we have the following order of events:
  1532 + //
  1533 + // * reused object gets loaded into the xref table
  1534 + // * old object is read here while reading xref streams
  1535 + // * original xref entry is ignored (since already in xref table)
  1536 + //
  1537 + // It is the second step that causes a problem. Even though the xref table is correct in
  1538 + // this case, the old object is already in the cache and so effectively prevails over the
  1539 + // reused object. To work around this issue, we have a special case for the xref stream (via
  1540 + // the skip_cache_if_in_xref): if the object is already in the xref stream, don't cache what
  1541 + // we read here.
  1542 + //
  1543 + // It is likely that the same bug may exist for linearization hint tables, but the existing
  1544 + // code uses end_before_space and end_after_space from the cache, so fixing that would
  1545 + // require more significant rework. The chances of a linearization hint stream being reused
  1546 + // seems smaller because the xref stream is probably the highest object in the file and the
  1547 + // linearization hint stream would be some random place in the middle, so I'm leaving that
  1548 + // bug unfixed for now. If the bug were to be fixed, we could use !check_og in place of
  1549 + // skip_cache_if_in_xref.
  1550 + QTC::TC("qpdf", "QPDF skipping cache for known unchecked object");
  1551 + return oh;
  1552 + }
  1553 +
  1554 + // Determine the end offset of this object before and after white space. We use these
  1555 + // numbers to validate linearization hint tables. Offsets and lengths of objects may imply
  1556 + // the end of an object to be anywhere between these values.
  1557 + qpdf_offset_t end_before_space = m->file->tell();
  1558 +
  1559 + // skip over spaces
  1560 + while (true) {
  1561 + char ch;
  1562 + if (!m->file->read(&ch, 1)) {
  1563 + throw damagedPDF(m->file->tell(), "EOF after endobj");
1531 } 1564 }
1532 - qpdf_offset_t end_after_space = m->file->tell();  
1533 - if (skip_cache_if_in_xref && m->xref_table.contains(og)) {  
1534 - // Ordinarily, an object gets read here when resolved through xref table or stream. In  
1535 - // the special case of the xref stream and linearization hint tables, the offset comes  
1536 - // from another source. For the specific case of xref streams, the xref stream is read  
1537 - // and loaded into the object cache very early in parsing. Ordinarily, when a file is  
1538 - // updated by appending, items inserted into the xref table in later updates take  
1539 - // precedence over earlier items. In the special case of reusing the object number  
1540 - // previously used as the xref stream, we have the following order of events:  
1541 - //  
1542 - // * reused object gets loaded into the xref table  
1543 - // * old object is read here while reading xref streams  
1544 - // * original xref entry is ignored (since already in xref table)  
1545 - //  
1546 - // It is the second step that causes a problem. Even though the xref table is correct in  
1547 - // this case, the old object is already in the cache and so effectively prevails over  
1548 - // the reused object. To work around this issue, we have a special case for the xref  
1549 - // stream (via the skip_cache_if_in_xref): if the object is already in the xref stream,  
1550 - // don't cache what we read here.  
1551 - //  
1552 - // It is likely that the same bug may exist for linearization hint tables, but the  
1553 - // existing code uses end_before_space and end_after_space from the cache, so fixing  
1554 - // that would require more significant rework. The chances of a linearization hint  
1555 - // stream being reused seems smaller because the xref stream is probably the highest  
1556 - // object in the file and the linearization hint stream would be some random place in  
1557 - // the middle, so I'm leaving that bug unfixed for now. If the bug were to be fixed, we  
1558 - // could use !check_og in place of skip_cache_if_in_xref.  
1559 - QTC::TC("qpdf", "QPDF skipping cache for known unchecked object");  
1560 - } else {  
1561 - updateCache(og, oh.getObj(), end_before_space, end_after_space); 1565 + if (!isspace(static_cast<unsigned char>(ch))) {
  1566 + m->file->seek(-1, SEEK_CUR);
  1567 + break;
1562 } 1568 }
1563 } 1569 }
  1570 + updateCache(og, oh.getObj(), end_before_space, m->file->tell());
1564 1571
1565 return oh; 1572 return oh;
1566 } 1573 }
@@ -1587,12 +1594,8 @@ QPDF::resolve(QPDFObjGen og) @@ -1587,12 +1594,8 @@ QPDF::resolve(QPDFObjGen og)
1587 try { 1594 try {
1588 switch (entry.getType()) { 1595 switch (entry.getType()) {
1589 case 1: 1596 case 1:
1590 - {  
1591 - qpdf_offset_t offset = entry.getOffset();  
1592 - // Object stored in cache by readObjectAtOffset  
1593 - QPDFObjGen a_og;  
1594 - QPDFObjectHandle oh = readObjectAtOffset(true, offset, "", og, a_og, false);  
1595 - } 1597 + // Object stored in cache by readObjectAtOffset
  1598 + readObjectAtOffset(true, entry.getOffset(), "", og);
1596 break; 1599 break;
1597 1600
1598 case 2: 1601 case 2: