Commit aeb892f99bad9f6c24aef94a2d93d573c6de0382

Authored by Jay Berkenbilt
1 parent c551b972

accept stream keyword with CR only

git-svn-id: svn+q:///qpdf/trunk@1052 71b93d88-0707-0410-a8cf-f5a4172ac649
ChangeLog
... ... @@ -2,6 +2,11 @@
2 2  
3 3 * 2.2.3: release
4 4  
  5 + * libqpdf/QPDF.cc (readObjectInternal): Accept the case of the
  6 + stream keyword being followed by a carriage return by itself. While
  7 + this is not permitted by the specification, there are PDF files
  8 + that do this, and other readers can read them.
  9 +
5 10 * libqpdf/Pl_QPDFTokenizer.cc (processChar): When an inline image
6 11 is detected, suspend normalization only up to the end of the
7 12 inline image rather than for the remainder of the content stream.
... ...
libqpdf/QPDF.cc
... ... @@ -1331,24 +1331,66 @@ QPDF::readObjectInternal(PointerHolder<InputSource> input,
1331 1331 if (readToken(input) ==
1332 1332 QPDFTokenizer::Token(QPDFTokenizer::tt_word, "stream"))
1333 1333 {
1334   - // Kill to next actual newline. Do not use readLine()
1335   - // here -- streams are a special case. The next
1336   - // single newline character marks the end of the
1337   - // stream token. It is incorrect to strip subsequent
1338   - // carriage returns or newlines as they may be part of
1339   - // the stream.
  1334 + // The PDF specification states that the word "stream"
  1335 + // should be followed by either a carriage return and
  1336 + // a newline or by a newline alone. It specifically
  1337 + // disallows following it by a carriage return alone
  1338 + // since, in that case, there would be no way to tell
  1339 + // whether the NL in a CR NL sequence was part of the
  1340 + // stream data. However, some readers, including
  1341 + // Adobe reader, accept a carriage return by itself
  1342 + // when followed by a non-newline character, so that's
  1343 + // what we do here.
1340 1344 {
1341 1345 char ch;
1342   - do
  1346 + if (input->read(&ch, 1) == 0)
1343 1347 {
1344   - if (input->read(&ch, 1) == 0)
  1348 + // A premature EOF here will result in some
  1349 + // other problem that will get reported at
  1350 + // another time.
  1351 + }
  1352 + else if (ch == '\n')
  1353 + {
  1354 + // ready to read stream data
  1355 + QTC::TC("qpdf", "QPDF stream with NL only");
  1356 + }
  1357 + else if (ch == '\r')
  1358 + {
  1359 + // Read another character
  1360 + if (input->read(&ch, 1) != 0)
1345 1361 {
1346   - // A premature EOF here will result in
1347   - // some other problem that will get
1348   - // reported at another time.
1349   - ch = '\n';
  1362 + if (ch == '\n')
  1363 + {
  1364 + // Ready to read stream data
  1365 + QTC::TC("qpdf", "QPDF stream with CRNL");
  1366 + }
  1367 + else
  1368 + {
  1369 + // Treat the \r by itself as the
  1370 + // whitespace after the stream keyword and
  1371 + // start reading stream data in spite
  1372 + // of not having seen a newline.
  1373 + QTC::TC("qpdf", "QPDF stream with CR only");
  1374 + input->unreadCh(ch);
  1375 + warn(QPDFExc(
  1376 + qpdf_e_damaged_pdf,
  1377 + input->getName(),
  1378 + this->last_object_description,
  1379 + input->tell(),
  1380 + "stream keyword followed"
  1381 + " by carriage return only"));
  1382 + }
1350 1383 }
1351   - } while (ch != '\n');
  1384 + }
  1385 + else
  1386 + {
  1387 + QTC::TC("qpdf", "QPDF stream without newline");
  1388 + warn(QPDFExc(qpdf_e_damaged_pdf, input->getName(),
  1389 + this->last_object_description,
  1390 + input->tell(),
  1391 + "stream keyword not followed"
  1392 + " by proper line terminator"));
  1393 + }
1352 1394 }
1353 1395  
1354 1396 // Must get offset before accessing any additional
... ...
manual/qpdf-manual.xml
... ... @@ -2080,6 +2080,12 @@ print "\n";
2080 2080 <itemizedlist>
2081 2081 <listitem>
2082 2082 <para>
  2083 + Handle some damaged streams with incorrect characters
  2084 + following the stream keyword.
  2085 + </para>
  2086 + </listitem>
  2087 + <listitem>
  2088 + <para>
2083 2089 Improve handling of inline images when normalizing content
2084 2090 streams.
2085 2091 </para>
... ...
qpdf/qpdf.testcov
... ... @@ -188,3 +188,7 @@ QPDF_Stream getStreamData 0
188 188 QPDF_Stream expand filter abbreviation 0
189 189 qpdf-c called qpdf_read_memory 0
190 190 Pl_QPDFTokenizer found EI 0
  191 +QPDF stream without newline 0
  192 +QPDF stream with CR only 0
  193 +QPDF stream with CRNL 0
  194 +QPDF stream with NL only 0
... ...
qpdf/qtest/qpdf.test
... ... @@ -111,7 +111,7 @@ $td->runtest("new stream",
111 111 show_ntests();
112 112 # ----------
113 113 $td->notify("--- Miscellaneous Tests ---");
114   -$n_tests += 29;
  114 +$n_tests += 31;
115 115  
116 116 $td->runtest("qpdf version",
117 117 {$td->COMMAND => "qpdf --version"},
... ... @@ -265,6 +265,17 @@ $td->runtest("error/output redirection to strings",
265 265 $td->EXIT_STATUS => 0},
266 266 $td->NORMALIZE_NEWLINES);
267 267  
  268 +$td->runtest("odd terminators for stream keyword",
  269 + {$td->COMMAND =>
  270 + "qpdf --qdf --static-id" .
  271 + " stream-line-enders.pdf a.qdf"},
  272 + {$td->FILE => "stream-line-enders.out",
  273 + $td->EXIT_STATUS => 3},
  274 + $td->NORMALIZE_NEWLINES);
  275 +$td->runtest("check output",
  276 + {$td->FILE => "a.qdf"},
  277 + {$td->FILE => "stream-line-enders.qdf"});
  278 +
268 279 show_ntests();
269 280 # ----------
270 281 $td->notify("--- Error Condition Tests ---");
... ...
qpdf/qtest/qpdf/stream-line-enders.out 0 → 100644
  1 +WARNING: stream-line-enders.pdf (object 5 0, file position 378): stream keyword followed by carriage return only
  2 +WARNING: stream-line-enders.pdf (object 6 0, file position 437): stream keyword not followed by proper line terminator
  3 +qpdf: operation succeeded with warnings; resulting file may have some problems
... ...
qpdf/qtest/qpdf/stream-line-enders.pdf 0 → 100644
  1 +%PDF-1.3
  2 +%¿÷¢þ
  3 +1 0 obj
  4 +<< /Pages 2 0 R /Type /Catalog >>
  5 +endobj
  6 +2 0 obj
  7 +<< /Count 1 /Kids [ 3 0 R ] /Type /Pages >>
  8 +endobj
  9 +3 0 obj
  10 +<< /Contents [ 4 0 R 5 0 R 6 0 R ] /MediaBox [ 0 0 612 792 ] /Parent 2 0 R /Resources << /Font << /F1 7 0 R >> /ProcSet 8 0 R >> /Type /Page >>
  11 +endobj
  12 +4 0 obj
  13 +<< /Length 14 >>
  14 +stream
  15 +BT
  16 + /F1 24 Tf
  17 +endstream
  18 +endobj
  19 +5 0 obj
  20 +<< /Length 10 >>
  21 +stream 72 720 Td
  22 +endstream
  23 +endobj
  24 +6 0 obj
  25 +<< /Length 15 >>
  26 +stream (Potato) Tj
  27 +ET
  28 +endstream
  29 +endobj
  30 +7 0 obj
  31 +<< /BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font >>
  32 +endobj
  33 +8 0 obj
  34 +[ /PDF /Text ]
  35 +endobj
  36 +xref
  37 +0 9
  38 +0000000000 65535 f
  39 +0000000015 00000 n
  40 +0000000064 00000 n
  41 +0000000123 00000 n
  42 +0000000282 00000 n
  43 +0000000346 00000 n
  44 +0000000405 00000 n
  45 +0000000469 00000 n
  46 +0000000576 00000 n
  47 +trailer << /Root 1 0 R /Size 9 /ID [<08aa98c73f8a7262d77c8328772c3989><7b1f32865e2165debe277f27ee790092>] >>
  48 +startxref
  49 +606
  50 +%%EOF
... ...
qpdf/qtest/qpdf/stream-line-enders.qdf 0 → 100644
  1 +%PDF-1.3
  2 +%¿÷¢þ
  3 +%QDF-1.0
  4 +
  5 +%% Original object ID: 1 0
  6 +1 0 obj
  7 +<<
  8 + /Pages 2 0 R
  9 + /Type /Catalog
  10 +>>
  11 +endobj
  12 +
  13 +%% Original object ID: 2 0
  14 +2 0 obj
  15 +<<
  16 + /Count 1
  17 + /Kids [
  18 + 3 0 R
  19 + ]
  20 + /Type /Pages
  21 +>>
  22 +endobj
  23 +
  24 +%% Page 1
  25 +%% Original object ID: 3 0
  26 +3 0 obj
  27 +<<
  28 + /Contents [
  29 + 4 0 R
  30 + 6 0 R
  31 + 8 0 R
  32 + ]
  33 + /MediaBox [
  34 + 0
  35 + 0
  36 + 612
  37 + 792
  38 + ]
  39 + /Parent 2 0 R
  40 + /Resources <<
  41 + /Font <<
  42 + /F1 10 0 R
  43 + >>
  44 + /ProcSet 11 0 R
  45 + >>
  46 + /Type /Page
  47 +>>
  48 +endobj
  49 +
  50 +%% Contents for page 1
  51 +%% Original object ID: 4 0
  52 +4 0 obj
  53 +<<
  54 + /Length 5 0 R
  55 +>>
  56 +stream
  57 +BT
  58 + /F1 24 Tf
  59 +endstream
  60 +endobj
  61 +
  62 +5 0 obj
  63 +14
  64 +endobj
  65 +
  66 +%% Contents for page 1
  67 +%% Original object ID: 5 0
  68 +6 0 obj
  69 +<<
  70 + /Length 7 0 R
  71 +>>
  72 +stream
  73 +72 720 Td
  74 +endstream
  75 +endobj
  76 +
  77 +7 0 obj
  78 +10
  79 +endobj
  80 +
  81 +%% Contents for page 1
  82 +%% Original object ID: 6 0
  83 +8 0 obj
  84 +<<
  85 + /Length 9 0 R
  86 +>>
  87 +stream
  88 +(Potato) Tj
  89 +ET
  90 +endstream
  91 +endobj
  92 +
  93 +9 0 obj
  94 +15
  95 +endobj
  96 +
  97 +%% Original object ID: 7 0
  98 +10 0 obj
  99 +<<
  100 + /BaseFont /Helvetica
  101 + /Encoding /WinAnsiEncoding
  102 + /Name /F1
  103 + /Subtype /Type1
  104 + /Type /Font
  105 +>>
  106 +endobj
  107 +
  108 +%% Original object ID: 8 0
  109 +11 0 obj
  110 +[
  111 + /PDF
  112 + /Text
  113 +]
  114 +endobj
  115 +
  116 +xref
  117 +0 12
  118 +0000000000 65535 f
  119 +0000000052 00000 n
  120 +0000000133 00000 n
  121 +0000000242 00000 n
  122 +0000000516 00000 n
  123 +0000000585 00000 n
  124 +0000000654 00000 n
  125 +0000000719 00000 n
  126 +0000000788 00000 n
  127 +0000000858 00000 n
  128 +0000000904 00000 n
  129 +0000001050 00000 n
  130 +trailer <<
  131 + /Root 1 0 R
  132 + /Size 12
  133 + /ID [<08aa98c73f8a7262d77c8328772c3989><31415926535897932384626433832795>]
  134 +>>
  135 +startxref
  136 +1086
  137 +%%EOF
... ...