Commit 72bd486337a35fb1b8bef744d1387edc93738682

Authored by m-holger
1 parent 3b97c9bd

Refactor QPDF::parse_xrefEntry

Move reading of the entry from read_xrefTable to parse_xrefEntry.

Split parse_xrefEntry into two new methods read_xrefEntry and
read_bad_xrefEntry. read_xrefEntry is optimised for reading
correct entries. To handle incorrect entries it calls read_bad_xrefEntry,
which is largely unchanged from parse_xrefEntry.
include/qpdf/QPDF.hh
... ... @@ -1004,7 +1004,8 @@ class QPDF
1004 1004 bool resolveXRefTable();
1005 1005 void reconstruct_xref(QPDFExc& e);
1006 1006 bool parse_xrefFirst(std::string const& line, int& obj, int& num, int& bytes);
1007   - bool parse_xrefEntry(std::string const& line, qpdf_offset_t& f1, int& f2, char& type);
  1007 + bool read_xrefEntry(qpdf_offset_t& f1, int& f2, char& type);
  1008 + bool read_bad_xrefEntry(qpdf_offset_t& f1, int& f2, char& type);
1008 1009 qpdf_offset_t read_xrefTable(qpdf_offset_t offset);
1009 1010 qpdf_offset_t read_xrefStream(qpdf_offset_t offset);
1010 1011 qpdf_offset_t processXRefStream(qpdf_offset_t offset, QPDFObjectHandle& xref_stream);
... ...
libqpdf/QPDF.cc
... ... @@ -2,6 +2,7 @@
2 2  
3 3 #include <qpdf/QPDF.hh>
4 4  
  5 +#include <array>
5 6 #include <atomic>
6 7 #include <cstring>
7 8 #include <limits>
... ... @@ -767,11 +768,15 @@ QPDF::parse_xrefFirst(std::string const&amp; line, int&amp; obj, int&amp; num, int&amp; bytes)
767 768 }
768 769  
769 770 bool
770   -QPDF::parse_xrefEntry(std::string const& line, qpdf_offset_t& f1, int& f2, char& type)
  771 +QPDF::read_bad_xrefEntry(qpdf_offset_t& f1, int& f2, char& type)
771 772 {
  773 + // Reposition after initial read attempt and reread.
  774 + m->file->seek(m->file->getLastOffset(), SEEK_SET);
  775 + auto line = m->file->readLine(30);
  776 +
772 777 // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
773 778 // buffer.
774   - char const* p = line.c_str();
  779 + char const* p = line.data();
775 780  
776 781 // Skip zero or more spaces. There aren't supposed to be any.
777 782 bool invalid = false;
... ... @@ -842,18 +847,73 @@ QPDF::parse_xrefEntry(std::string const&amp; line, qpdf_offset_t&amp; f1, int&amp; f2, char&amp;
842 847 return true;
843 848 }
844 849  
  850 +// Optimistically read and parse xref entry. If entry is bad, call read_bad_xrefEntry and return
  851 +// result.
  852 +bool
  853 +QPDF::read_xrefEntry(qpdf_offset_t& f1, int& f2, char& type)
  854 +{
  855 + std::array<char, 21> line;
  856 + if (m->file->read(line.data(), 20) != 20) {
  857 + // C++20: [[unlikely]]
  858 + return false;
  859 + }
  860 + line[20] = '\0';
  861 + char const* p = line.data();
  862 +
  863 + int f1_len = 0;
  864 + int f2_len = 0;
  865 +
  866 + // is_space and is_digit both return false on '\0', so this will not overrun the null-terminated
  867 + // buffer.
  868 +
  869 + // Gather f1 digits. NB No risk of overflow as 9'999'999'999 < max long long.
  870 + while (*p == '0') {
  871 + ++f1_len;
  872 + ++p;
  873 + }
  874 + while (QUtil::is_digit(*p) && f1_len++ < 10) {
  875 + f1 *= 10;
  876 + f1 += *p++ - '0';
  877 + }
  878 + // Require space
  879 + if (!QUtil::is_space(*p++)) {
  880 + // Entry doesn't start with space or digit.
  881 + // C++20: [[unlikely]]
  882 + return false;
  883 + }
  884 + // Gather digits. NB No risk of overflow as 99'999 < max int.
  885 + while (*p == '0') {
  886 + ++f2_len;
  887 + ++p;
  888 + }
  889 + while (QUtil::is_digit(*p) && f2_len++ < 5) {
  890 + f2 *= 10;
  891 + f2 += static_cast<int>(*p++ - '0');
  892 + }
  893 + if (QUtil::is_space(*p++) && (*p == 'f' || *p == 'n')) {
  894 + // C++20: [[likely]]
  895 + type = *p;
  896 + ++p;
  897 + ++p; // No test for valid line[19].
  898 + if ((*p == '\n' || *p == '\r') && f1_len == 10 && f2_len == 5) {
  899 + // C++20: [[likely]]
  900 + return true;
  901 + }
  902 + }
  903 + return read_bad_xrefEntry(f1, f2, type);
  904 +}
  905 +
  906 +// Read a single cross-reference table section and associated trailer.
845 907 qpdf_offset_t
846 908 QPDF::read_xrefTable(qpdf_offset_t xref_offset)
847 909 {
848 910 std::vector<QPDFObjGen> deleted_items;
849 911  
850 912 m->file->seek(xref_offset, SEEK_SET);
851   - bool done = false;
852   - while (!done) {
853   - char linebuf[51];
854   - memset(linebuf, 0, sizeof(linebuf));
855   - m->file->read(linebuf, sizeof(linebuf) - 1);
856   - std::string line = linebuf;
  913 + std::string line;
  914 + while (true) {
  915 + line.assign(50, '\0');
  916 + m->file->read(line.data(), line.size());
857 917 int obj = 0;
858 918 int num = 0;
859 919 int bytes = 0;
... ... @@ -867,12 +927,11 @@ QPDF::read_xrefTable(qpdf_offset_t xref_offset)
867 927 // This is needed by checkLinearization()
868 928 m->first_xref_item_offset = m->file->tell();
869 929 }
870   - std::string xref_entry = m->file->readLine(30);
871 930 // For xref_table, these will always be small enough to be ints
872 931 qpdf_offset_t f1 = 0;
873 932 int f2 = 0;
874 933 char type = '\0';
875   - if (!parse_xrefEntry(xref_entry, f1, f2, type)) {
  934 + if (!read_xrefEntry(f1, f2, type)) {
876 935 QTC::TC("qpdf", "QPDF invalid xref entry");
877 936 throw damagedPDF(
878 937 "xref table", "invalid xref entry (obj=" + std::to_string(i) + ")");
... ... @@ -886,7 +945,7 @@ QPDF::read_xrefTable(qpdf_offset_t xref_offset)
886 945 }
887 946 qpdf_offset_t pos = m->file->tell();
888 947 if (readToken(m->file).isWord("trailer")) {
889   - done = true;
  948 + break;
890 949 } else {
891 950 m->file->seek(pos, SEEK_SET);
892 951 }
... ... @@ -945,6 +1004,7 @@ QPDF::read_xrefTable(qpdf_offset_t xref_offset)
945 1004 return xref_offset;
946 1005 }
947 1006  
  1007 +// Read a single cross-reference stream.
948 1008 qpdf_offset_t
949 1009 QPDF::read_xrefStream(qpdf_offset_t xref_offset)
950 1010 {
... ...