Commit bb9e91adbd75d05d0d60227b2d419d7ee12e1b42

Authored by Jay Berkenbilt
1 parent ebd5ed63

Create isolated tokenizer tests

This tokenizes outer parts of the file, page content streams, and
object streams. It is for exercising the tokenizer in isolation and is
being introduced before reworking the lexical layer of qpdf.
qpdf/build.mk
1 -BINS_qpdf = qpdf test_driver pdf_from_scratch test_large_file 1 +BINS_qpdf = qpdf test_driver pdf_from_scratch test_large_file test_tokenizer
2 CBINS_qpdf = qpdf-ctest 2 CBINS_qpdf = qpdf-ctest
3 3
4 TARGETS_qpdf = $(foreach B,$(BINS_qpdf) $(CBINS_qpdf),qpdf/$(OUTPUT_DIR)/$(call binname,$(B))) 4 TARGETS_qpdf = $(foreach B,$(BINS_qpdf) $(CBINS_qpdf),qpdf/$(OUTPUT_DIR)/$(call binname,$(B)))
qpdf/qtest/qpdf.test
@@ -240,7 +240,7 @@ foreach my $d (@bug_tests) @@ -240,7 +240,7 @@ foreach my $d (@bug_tests)
240 show_ntests(); 240 show_ntests();
241 # ---------- 241 # ----------
242 $td->notify("--- Miscellaneous Tests ---"); 242 $td->notify("--- Miscellaneous Tests ---");
243 -$n_tests += 96; 243 +$n_tests += 97;
244 244
245 $td->runtest("qpdf version", 245 $td->runtest("qpdf version",
246 {$td->COMMAND => "qpdf --version"}, 246 {$td->COMMAND => "qpdf --version"},
@@ -263,6 +263,11 @@ $td->runtest("check pass1 file", @@ -263,6 +263,11 @@ $td->runtest("check pass1 file",
263 {$td->FILE => "b.pdf"}, 263 {$td->FILE => "b.pdf"},
264 {$td->FILE => "minimal-linearize-pass1.pdf"}); 264 {$td->FILE => "minimal-linearize-pass1.pdf"});
265 265
  266 +$td->runtest("tokenizer",
  267 + {$td->COMMAND => "test_tokenizer tokens.pdf"},
  268 + {$td->FILE => "tokens.out", $td->EXIT_STATUS => 0},
  269 + $td->NORMALIZE_NEWLINES);
  270 +
266 foreach (my $i = 1; $i <= 3; ++$i) 271 foreach (my $i = 1; $i <= 3; ++$i)
267 { 272 {
268 $td->runtest("misc tests", 273 $td->runtest("misc tests",
qpdf/qtest/qpdf/tokens.out 0 → 100644
  1 +--- BEGIN FILE ---
  2 +60: integer: 1
  3 +62: integer: 0
  4 +64: word: obj
  5 +68: dict_open: <<
  6 +73: name: /Type
  7 +79: name: /ObjStm
  8 +89: name: /Length
  9 +97: integer: 6020
  10 +104: name: /N
  11 +107: integer: 35
  12 +112: name: /First
  13 +119: integer: 323
  14 +123: dict_close: >>
  15 +126: word: stream
  16 +skipping to endstream
  17 +6153: word: endstream
  18 +6163: word: endobj
  19 +6222: integer: 37
  20 +6225: integer: 0
  21 +6227: word: obj
  22 +6231: dict_open: <<
  23 +6236: name: /Length
  24 +6244: integer: 38
  25 +6247: integer: 0
  26 +6249: word: R
  27 +6251: dict_close: >>
  28 +6254: word: stream
  29 +skipping to endstream
  30 +6305: word: endstream
  31 +6315: word: endobj
  32 +6323: integer: 38
  33 +6326: integer: 0
  34 +6328: word: obj
  35 +6332: integer: 44
  36 +6335: word: endobj
  37 +6394: integer: 39
  38 +6397: integer: 0
  39 +6399: word: obj
  40 +6403: dict_open: <<
  41 +6408: name: /Length
  42 +6416: integer: 40
  43 +6419: integer: 0
  44 +6421: word: R
  45 +6423: dict_close: >>
  46 +6426: word: stream
  47 +skipping to endstream
  48 +6832: word: endstream
  49 +6842: word: endobj
  50 +6850: integer: 40
  51 +6853: integer: 0
  52 +6855: word: obj
  53 +6859: integer: 399
  54 +6863: word: endobj
  55 +6922: integer: 41
  56 +6925: integer: 0
  57 +6927: word: obj
  58 +6931: dict_open: <<
  59 +6936: name: /Length
  60 +6944: integer: 42
  61 +6947: integer: 0
  62 +6949: word: R
  63 +6951: dict_close: >>
  64 +6954: word: stream
  65 +skipping to endstream
  66 +7001: word: endstream
  67 +7011: word: endobj
  68 +7019: integer: 42
  69 +7022: integer: 0
  70 +7024: word: obj
  71 +7028: integer: 40
  72 +7031: word: endobj
  73 +7090: integer: 43
  74 +7093: integer: 0
  75 +7095: word: obj
  76 +7099: dict_open: <<
  77 +7104: name: /Length
  78 +7112: integer: 44
  79 +7115: integer: 0
  80 +7117: word: R
  81 +7119: dict_close: >>
  82 +7122: word: stream
  83 +skipping to endstream
  84 +7404: word: endstream
  85 +7414: word: endobj
  86 +7422: integer: 44
  87 +7425: integer: 0
  88 +7427: word: obj
  89 +7431: integer: 275
  90 +7435: word: endobj
  91 +7494: integer: 45
  92 +7497: integer: 0
  93 +7499: word: obj
  94 +7503: dict_open: <<
  95 +7508: name: /Length
  96 +7516: integer: 46
  97 +7519: integer: 0
  98 +7521: word: R
  99 +7523: dict_close: >>
  100 +7526: word: stream
  101 +skipping to endstream
  102 +7601: word: endstream
  103 +7611: word: endobj
  104 +7619: integer: 46
  105 +7622: integer: 0
  106 +7624: word: obj
  107 +7628: integer: 68
  108 +7631: word: endobj
  109 +7690: integer: 47
  110 +7693: integer: 0
  111 +7695: word: obj
  112 +7699: dict_open: <<
  113 +7704: name: /Length
  114 +7712: integer: 48
  115 +7715: integer: 0
  116 +7717: word: R
  117 +7719: dict_close: >>
  118 +7722: word: stream
  119 +skipping to endstream
  120 +7773: word: endstream
  121 +7783: word: endobj
  122 +7791: integer: 48
  123 +7794: integer: 0
  124 +7796: word: obj
  125 +7800: integer: 44
  126 +7803: word: endobj
  127 +7862: integer: 49
  128 +7865: integer: 0
  129 +7867: word: obj
  130 +7871: dict_open: <<
  131 +7876: name: /Length
  132 +7884: integer: 50
  133 +7887: integer: 0
  134 +7889: word: R
  135 +7891: dict_close: >>
  136 +7894: word: stream
  137 +skipping to endstream
  138 +7945: word: endstream
  139 +7955: word: endobj
  140 +7963: integer: 50
  141 +7966: integer: 0
  142 +7968: word: obj
  143 +7972: integer: 44
  144 +7975: word: endobj
  145 +8034: integer: 51
  146 +8037: integer: 0
  147 +8039: word: obj
  148 +8043: dict_open: <<
  149 +8048: name: /Length
  150 +8056: integer: 52
  151 +8059: integer: 0
  152 +8061: word: R
  153 +8063: dict_close: >>
  154 +8066: word: stream
  155 +skipping to endstream
  156 +8117: word: endstream
  157 +8127: word: endobj
  158 +8135: integer: 52
  159 +8138: integer: 0
  160 +8140: word: obj
  161 +8144: integer: 44
  162 +8147: word: endobj
  163 +8206: integer: 53
  164 +8209: integer: 0
  165 +8211: word: obj
  166 +8215: dict_open: <<
  167 +8220: name: /Length
  168 +8228: integer: 54
  169 +8231: integer: 0
  170 +8233: word: R
  171 +8235: dict_close: >>
  172 +8238: word: stream
  173 +skipping to endstream
  174 +8289: word: endstream
  175 +8299: word: endobj
  176 +8307: integer: 54
  177 +8310: integer: 0
  178 +8312: word: obj
  179 +8316: integer: 44
  180 +8319: word: endobj
  181 +8379: integer: 55
  182 +8382: integer: 0
  183 +8384: word: obj
  184 +8388: dict_open: <<
  185 +8393: name: /Length
  186 +8401: integer: 56
  187 +8404: integer: 0
  188 +8406: word: R
  189 +8408: dict_close: >>
  190 +8411: word: stream
  191 +skipping to endstream
  192 +8462: word: endstream
  193 +8472: word: endobj
  194 +8480: integer: 56
  195 +8483: integer: 0
  196 +8485: word: obj
  197 +8489: integer: 44
  198 +8492: word: endobj
  199 +8552: integer: 57
  200 +8555: integer: 0
  201 +8557: word: obj
  202 +8561: dict_open: <<
  203 +8566: name: /Length
  204 +8574: integer: 58
  205 +8577: integer: 0
  206 +8579: word: R
  207 +8581: dict_close: >>
  208 +8584: word: stream
  209 +skipping to endstream
  210 +8635: word: endstream
  211 +8645: word: endobj
  212 +8653: integer: 58
  213 +8656: integer: 0
  214 +8658: word: obj
  215 +8662: integer: 44
  216 +8665: word: endobj
  217 +8673: integer: 59
  218 +8676: integer: 0
  219 +8678: word: obj
  220 +8682: dict_open: <<
  221 +8687: name: /Type
  222 +8693: name: /XRef
  223 +8701: name: /Length
  224 +8709: integer: 240
  225 +8715: name: /W
  226 +8718: array_open: [
  227 +8720: integer: 1
  228 +8722: integer: 2
  229 +8724: integer: 1
  230 +8726: array_close: ]
  231 +8730: name: /Root
  232 +8736: integer: 2
  233 +8738: integer: 0
  234 +8740: word: R
  235 +8744: name: /Size
  236 +8750: integer: 60
  237 +8755: name: /ID
  238 +8759: array_open: [
  239 +8760: string: \x88\x04\x8e\x17\xc9a\xe0\x94\xff\xec\xe9\x8c\xb8\x8cF\xd0 (raw: <88048e17c961e094ffece98cb88c46d0>)
  240 +8794: string: \xed\xd6\x0f\xe8\xee\x87\xf8\x871\xa8o\x81\x9f\xe6Q\x99 (raw: <edd60fe8ee87f88731a86f819fe65199>)
  241 +8828: array_close: ]
  242 +8830: dict_close: >>
  243 +8833: word: stream
  244 +skipping to endstream
  245 +9081: word: endstream
  246 +9091: word: endobj
  247 +9099: word: startxref
  248 +9109: integer: 8673
  249 +9120: eof
  250 +--- END FILE ---
  251 +--- BEGIN PAGE 1 ---
  252 +0: word: BT
  253 +5: name: /F1
  254 +9: integer: 24
  255 +12: word: Tf
  256 +17: integer: 72
  257 +20: integer: 720
  258 +24: word: Td
  259 +29: string: Potato (raw: (Potato))
  260 +38: word: Tj
  261 +41: word: ET
  262 +44: eof
  263 +--- END PAGE 1 ---
  264 +--- BEGIN PAGE 2 ---
  265 +0: word: BT
  266 +5: name: /F1
  267 +9: integer: 24
  268 +12: word: Tf
  269 +17: integer: 72
  270 +20: integer: 720
  271 +24: word: Td
  272 +29: string: Potato (raw: (Potato))
  273 +38: word: Tj
  274 +41: word: ET
  275 +44: word: BI
  276 +47: name: /CS
  277 +51: name: /G
  278 +53: name: /W
  279 +56: integer: 66
  280 +58: name: /H
  281 +61: integer: 47
  282 +63: name: /BPC
  283 +68: integer: 8
  284 +69: name: /F
  285 +71: name: /Fl
  286 +74: name: /DP
  287 +77: dict_open: <<
  288 +79: name: /Predictor
  289 +90: integer: 15
  290 +92: name: /Columns
  291 +101: integer: 66
  292 +103: dict_close: >>
  293 +106: word: ID
  294 +skipping to EI
  295 +352: word: EI
  296 +355: word: BT
  297 +360: name: /F1
  298 +364: integer: 24
  299 +367: word: Tf
  300 +372: integer: 72
  301 +375: integer: 720
  302 +379: word: Td
  303 +384: string: Potato (raw: (Potato))
  304 +393: word: Tj
  305 +396: word: ET
  306 +399: eof
  307 +--- END PAGE 2 ---
  308 +--- BEGIN PAGE 3 ---
  309 +0: word: BT
  310 +5: name: /F1
  311 +9: integer: 24
  312 +12: word: Tf
  313 +17: integer: 72
  314 +20: integer: 720
  315 +24: word: Td
  316 +29: bad: Potato\x0aET\x0a (raw: (Potato\x0aET\x0a) (EOF while reading token)
  317 +40: eof
  318 +--- END PAGE 3 ---
  319 +--- BEGIN PAGE 4 ---
  320 +0: word: BT
  321 +5: name: /F1
  322 +9: integer: 24
  323 +12: word: Tf
  324 +17: string: \xfe\xeb (raw: <feeb>)
  325 +26: string: \xab\xcd (raw: <ab\x0aCD>)
  326 +36: string: quack (raw: (qu\\x0d\x0aack))
  327 +49: string: quack (raw: (qu\\x0aack))
  328 +61: string: quack (raw: (qu\\x0dack))
  329 +73: integer: 72
  330 +76: integer: 720
  331 +80: word: Td
  332 +85: real: 3.14
  333 +92: real: 3.
  334 +97: real: .14
  335 +103: real: +3.14
  336 +111: real: +3.
  337 +117: real: +.14
  338 +124: real: -3.14
  339 +132: real: -3.
  340 +138: real: -.14
  341 +145: integer: +16059
  342 +154: integer: -16059
  343 +163: word: +.
  344 +168: bad: fadeE (raw: <fade\x0aET) (invalid character (T) in hexstring)
  345 +177: bad: ) (unexpected ))
  346 +178: bad: > (unexpected >)
  347 +179: word: quack
  348 +185: bad: /name#oops (invalid name token)
  349 +196: name: /name (raw: /n#61me)
  350 +204: word: one
  351 +208: bool: true
  352 +213: word: two
  353 +217: bool: false
  354 +223: word: three
  355 +229: null: null
  356 +234: word: four
  357 +239: word: !@#$^&
  358 +245: brace_open: {
  359 +246: brace_close: }
  360 +247: word: *-_+=
  361 +253: word: abc123def3.14true
  362 +271: bad: ff (raw: <ff\x0a) (EOF while reading token)
  363 +275: eof
  364 +--- END PAGE 4 ---
  365 +--- BEGIN PAGE 5 ---
  366 +0: word: BT
  367 +5: bad: /F#00x (null character not allowed in name token)
  368 +12: integer: 24
  369 +15: word: Tf
  370 +20: integer: 72
  371 +23: integer: 720
  372 +27: word: Td
  373 +32: string: P\x00tat\x00 (raw: (P\x00tat\000))
  374 +44: word: Tj
  375 +47: word: ET
  376 +52: name: /ThisMustBeLast
  377 +68: eof
  378 +--- END PAGE 5 ---
  379 +--- BEGIN PAGE 6 ---
  380 +0: word: ID
  381 +skipping to EI
  382 +EI not found
  383 +5: name: /F1
  384 +9: integer: 24
  385 +12: word: Tf
  386 +17: integer: 72
  387 +20: integer: 720
  388 +24: word: Td
  389 +29: string: Potato (raw: (Potato))
  390 +38: word: Tj
  391 +41: word: ET
  392 +44: eof
  393 +--- END PAGE 6 ---
  394 +--- BEGIN PAGE 7 ---
  395 +0: word: BT
  396 +5: name: /F1
  397 +9: integer: 24
  398 +12: word: Tf
  399 +17: integer: 72
  400 +20: integer: 720
  401 +24: word: Td
  402 +29: string: Potato (raw: (Potato))
  403 +38: word: Tj
  404 +41: word: ET
  405 +44: eof
  406 +--- END PAGE 7 ---
  407 +--- BEGIN PAGE 8 ---
  408 +0: word: BT
  409 +5: name: /F1
  410 +9: integer: 24
  411 +12: word: Tf
  412 +17: integer: 72
  413 +20: integer: 720
  414 +24: word: Td
  415 +29: string: Potato (raw: (Potato))
  416 +38: word: Tj
  417 +41: word: ET
  418 +44: eof
  419 +--- END PAGE 8 ---
  420 +--- BEGIN PAGE 9 ---
  421 +0: word: BT
  422 +5: name: /F1
  423 +9: integer: 24
  424 +12: word: Tf
  425 +17: integer: 72
  426 +20: integer: 720
  427 +24: word: Td
  428 +29: string: Potato (raw: (Potato))
  429 +38: word: Tj
  430 +41: word: ET
  431 +44: eof
  432 +--- END PAGE 9 ---
  433 +--- BEGIN PAGE 10 ---
  434 +0: word: BT
  435 +5: name: /F1
  436 +9: integer: 24
  437 +12: word: Tf
  438 +17: integer: 72
  439 +20: integer: 720
  440 +24: word: Td
  441 +29: string: Potato (raw: (Potato))
  442 +38: word: Tj
  443 +41: word: ET
  444 +44: eof
  445 +--- END PAGE 10 ---
  446 +--- BEGIN PAGE 11 ---
  447 +0: word: BT
  448 +5: name: /F1
  449 +9: integer: 24
  450 +12: word: Tf
  451 +17: integer: 72
  452 +20: integer: 720
  453 +24: word: Td
  454 +29: string: Potato (raw: (Potato))
  455 +38: word: Tj
  456 +41: word: ET
  457 +44: eof
  458 +--- END PAGE 11 ---
  459 +--- BEGIN OBJECT STREAM 1 ---
  460 +0: integer: 2
  461 +2: integer: 0
  462 +4: integer: 3
  463 +6: integer: 97
  464 +9: integer: 4
  465 +11: integer: 318
  466 +15: integer: 5
  467 +17: integer: 566
  468 +21: integer: 6
  469 +23: integer: 814
  470 +27: integer: 7
  471 +29: integer: 1062
  472 +34: integer: 8
  473 +36: integer: 1310
  474 +41: integer: 9
  475 +43: integer: 1558
  476 +48: integer: 10
  477 +51: integer: 1808
  478 +56: integer: 11
  479 +59: integer: 2058
  480 +64: integer: 12
  481 +67: integer: 2309
  482 +72: integer: 13
  483 +75: integer: 2560
  484 +80: integer: 14
  485 +83: integer: 2812
  486 +88: integer: 15
  487 +91: integer: 3064
  488 +96: integer: 16
  489 +99: integer: 3228
  490 +104: integer: 17
  491 +107: integer: 3309
  492 +112: integer: 18
  493 +115: integer: 3473
  494 +120: integer: 19
  495 +123: integer: 3554
  496 +128: integer: 20
  497 +131: integer: 3718
  498 +136: integer: 21
  499 +139: integer: 3799
  500 +144: integer: 22
  501 +147: integer: 3963
  502 +152: integer: 23
  503 +155: integer: 4044
  504 +160: integer: 24
  505 +163: integer: 4208
  506 +168: integer: 25
  507 +171: integer: 4289
  508 +176: integer: 26
  509 +179: integer: 4453
  510 +184: integer: 27
  511 +187: integer: 4534
  512 +192: integer: 28
  513 +195: integer: 4698
  514 +200: integer: 29
  515 +203: integer: 4779
  516 +208: integer: 30
  517 +211: integer: 4943
  518 +216: integer: 31
  519 +219: integer: 5024
  520 +224: integer: 32
  521 +227: integer: 5188
  522 +232: integer: 33
  523 +235: integer: 5269
  524 +240: integer: 34
  525 +243: integer: 5433
  526 +248: integer: 35
  527 +251: integer: 5514
  528 +256: integer: 36
  529 +259: integer: 5678
  530 +323: dict_open: <<
  531 +328: name: /Pages
  532 +335: integer: 3
  533 +337: integer: 0
  534 +339: word: R
  535 +343: name: /Type
  536 +349: name: /Catalog
  537 +358: dict_close: >>
  538 +420: dict_open: <<
  539 +425: name: /Count
  540 +432: integer: 11
  541 +437: name: /Kids
  542 +443: array_open: [
  543 +449: integer: 4
  544 +451: integer: 0
  545 +453: word: R
  546 +459: integer: 5
  547 +461: integer: 0
  548 +463: word: R
  549 +469: integer: 6
  550 +471: integer: 0
  551 +473: word: R
  552 +479: integer: 7
  553 +481: integer: 0
  554 +483: word: R
  555 +489: integer: 8
  556 +491: integer: 0
  557 +493: word: R
  558 +499: integer: 9
  559 +501: integer: 0
  560 +503: word: R
  561 +509: integer: 10
  562 +512: integer: 0
  563 +514: word: R
  564 +520: integer: 11
  565 +523: integer: 0
  566 +525: word: R
  567 +531: integer: 12
  568 +534: integer: 0
  569 +536: word: R
  570 +542: integer: 13
  571 +545: integer: 0
  572 +547: word: R
  573 +553: integer: 14
  574 +556: integer: 0
  575 +558: word: R
  576 +562: array_close: ]
  577 +566: name: /Type
  578 +572: name: /Pages
  579 +579: dict_close: >>
  580 +651: dict_open: <<
  581 +656: name: /Contents
  582 +666: integer: 37
  583 +669: integer: 0
  584 +671: word: R
  585 +675: name: /MediaBox
  586 +685: array_open: [
  587 +691: integer: 0
  588 +697: integer: 0
  589 +703: integer: 612
  590 +711: integer: 792
  591 +717: array_close: ]
  592 +721: name: /Parent
  593 +729: integer: 3
  594 +731: integer: 0
  595 +733: word: R
  596 +737: name: /Resources
  597 +748: dict_open: <<
  598 +755: name: /Font
  599 +761: dict_open: <<
  600 +770: name: /F1
  601 +774: integer: 15
  602 +777: integer: 0
  603 +779: word: R
  604 +785: dict_close: >>
  605 +792: name: /ProcSet
  606 +801: integer: 16
  607 +804: integer: 0
  608 +806: word: R
  609 +810: dict_close: >>
  610 +815: name: /Type
  611 +821: name: /Page
  612 +827: dict_close: >>
  613 +899: dict_open: <<
  614 +904: name: /Contents
  615 +914: integer: 39
  616 +917: integer: 0
  617 +919: word: R
  618 +923: name: /MediaBox
  619 +933: array_open: [
  620 +939: integer: 0
  621 +945: integer: 0
  622 +951: integer: 612
  623 +959: integer: 792
  624 +965: array_close: ]
  625 +969: name: /Parent
  626 +977: integer: 3
  627 +979: integer: 0
  628 +981: word: R
  629 +985: name: /Resources
  630 +996: dict_open: <<
  631 +1003: name: /Font
  632 +1009: dict_open: <<
  633 +1018: name: /F1
  634 +1022: integer: 17
  635 +1025: integer: 0
  636 +1027: word: R
  637 +1033: dict_close: >>
  638 +1040: name: /ProcSet
  639 +1049: integer: 18
  640 +1052: integer: 0
  641 +1054: word: R
  642 +1058: dict_close: >>
  643 +1063: name: /Type
  644 +1069: name: /Page
  645 +1075: dict_close: >>
  646 +1147: dict_open: <<
  647 +1152: name: /Contents
  648 +1162: integer: 41
  649 +1165: integer: 0
  650 +1167: word: R
  651 +1171: name: /MediaBox
  652 +1181: array_open: [
  653 +1187: integer: 0
  654 +1193: integer: 0
  655 +1199: integer: 612
  656 +1207: integer: 792
  657 +1213: array_close: ]
  658 +1217: name: /Parent
  659 +1225: integer: 3
  660 +1227: integer: 0
  661 +1229: word: R
  662 +1233: name: /Resources
  663 +1244: dict_open: <<
  664 +1251: name: /Font
  665 +1257: dict_open: <<
  666 +1266: name: /F1
  667 +1270: integer: 19
  668 +1273: integer: 0
  669 +1275: word: R
  670 +1281: dict_close: >>
  671 +1288: name: /ProcSet
  672 +1297: integer: 20
  673 +1300: integer: 0
  674 +1302: word: R
  675 +1306: dict_close: >>
  676 +1311: name: /Type
  677 +1317: name: /Page
  678 +1323: dict_close: >>
  679 +1395: dict_open: <<
  680 +1400: name: /Contents
  681 +1410: integer: 43
  682 +1413: integer: 0
  683 +1415: word: R
  684 +1419: name: /MediaBox
  685 +1429: array_open: [
  686 +1435: integer: 0
  687 +1441: integer: 0
  688 +1447: integer: 612
  689 +1455: integer: 792
  690 +1461: array_close: ]
  691 +1465: name: /Parent
  692 +1473: integer: 3
  693 +1475: integer: 0
  694 +1477: word: R
  695 +1481: name: /Resources
  696 +1492: dict_open: <<
  697 +1499: name: /Font
  698 +1505: dict_open: <<
  699 +1514: name: /F1
  700 +1518: integer: 21
  701 +1521: integer: 0
  702 +1523: word: R
  703 +1529: dict_close: >>
  704 +1536: name: /ProcSet
  705 +1545: integer: 22
  706 +1548: integer: 0
  707 +1550: word: R
  708 +1554: dict_close: >>
  709 +1559: name: /Type
  710 +1565: name: /Page
  711 +1571: dict_close: >>
  712 +1643: dict_open: <<
  713 +1648: name: /Contents
  714 +1658: integer: 45
  715 +1661: integer: 0
  716 +1663: word: R
  717 +1667: name: /MediaBox
  718 +1677: array_open: [
  719 +1683: integer: 0
  720 +1689: integer: 0
  721 +1695: integer: 612
  722 +1703: integer: 792
  723 +1709: array_close: ]
  724 +1713: name: /Parent
  725 +1721: integer: 3
  726 +1723: integer: 0
  727 +1725: word: R
  728 +1729: name: /Resources
  729 +1740: dict_open: <<
  730 +1747: name: /Font
  731 +1753: dict_open: <<
  732 +1762: name: /F1
  733 +1766: integer: 23
  734 +1769: integer: 0
  735 +1771: word: R
  736 +1777: dict_close: >>
  737 +1784: name: /ProcSet
  738 +1793: integer: 24
  739 +1796: integer: 0
  740 +1798: word: R
  741 +1802: dict_close: >>
  742 +1807: name: /Type
  743 +1813: name: /Page
  744 +1819: dict_close: >>
  745 +1891: dict_open: <<
  746 +1896: name: /Contents
  747 +1906: integer: 47
  748 +1909: integer: 0
  749 +1911: word: R
  750 +1915: name: /MediaBox
  751 +1925: array_open: [
  752 +1931: integer: 0
  753 +1937: integer: 0
  754 +1943: integer: 612
  755 +1951: integer: 792
  756 +1957: array_close: ]
  757 +1961: name: /Parent
  758 +1969: integer: 3
  759 +1971: integer: 0
  760 +1973: word: R
  761 +1977: name: /Resources
  762 +1988: dict_open: <<
  763 +1995: name: /Font
  764 +2001: dict_open: <<
  765 +2010: name: /F1
  766 +2014: integer: 25
  767 +2017: integer: 0
  768 +2019: word: R
  769 +2025: dict_close: >>
  770 +2032: name: /ProcSet
  771 +2041: integer: 26
  772 +2044: integer: 0
  773 +2046: word: R
  774 +2050: dict_close: >>
  775 +2055: name: /Type
  776 +2061: name: /Page
  777 +2067: dict_close: >>
  778 +2141: dict_open: <<
  779 +2146: name: /Contents
  780 +2156: integer: 49
  781 +2159: integer: 0
  782 +2161: word: R
  783 +2165: name: /MediaBox
  784 +2175: array_open: [
  785 +2181: integer: 0
  786 +2187: integer: 0
  787 +2193: integer: 612
  788 +2201: integer: 792
  789 +2207: array_close: ]
  790 +2211: name: /Parent
  791 +2219: integer: 3
  792 +2221: integer: 0
  793 +2223: word: R
  794 +2227: name: /Resources
  795 +2238: dict_open: <<
  796 +2245: name: /Font
  797 +2251: dict_open: <<
  798 +2260: name: /F1
  799 +2264: integer: 27
  800 +2267: integer: 0
  801 +2269: word: R
  802 +2275: dict_close: >>
  803 +2282: name: /ProcSet
  804 +2291: integer: 28
  805 +2294: integer: 0
  806 +2296: word: R
  807 +2300: dict_close: >>
  808 +2305: name: /Type
  809 +2311: name: /Page
  810 +2317: dict_close: >>
  811 +2391: dict_open: <<
  812 +2396: name: /Contents
  813 +2406: integer: 51
  814 +2409: integer: 0
  815 +2411: word: R
  816 +2415: name: /MediaBox
  817 +2425: array_open: [
  818 +2431: integer: 0
  819 +2437: integer: 0
  820 +2443: integer: 612
  821 +2451: integer: 792
  822 +2457: array_close: ]
  823 +2461: name: /Parent
  824 +2469: integer: 3
  825 +2471: integer: 0
  826 +2473: word: R
  827 +2477: name: /Resources
  828 +2488: dict_open: <<
  829 +2495: name: /Font
  830 +2501: dict_open: <<
  831 +2510: name: /F1
  832 +2514: integer: 29
  833 +2517: integer: 0
  834 +2519: word: R
  835 +2525: dict_close: >>
  836 +2532: name: /ProcSet
  837 +2541: integer: 30
  838 +2544: integer: 0
  839 +2546: word: R
  840 +2550: dict_close: >>
  841 +2555: name: /Type
  842 +2561: name: /Page
  843 +2567: dict_close: >>
  844 +2642: dict_open: <<
  845 +2647: name: /Contents
  846 +2657: integer: 53
  847 +2660: integer: 0
  848 +2662: word: R
  849 +2666: name: /MediaBox
  850 +2676: array_open: [
  851 +2682: integer: 0
  852 +2688: integer: 0
  853 +2694: integer: 612
  854 +2702: integer: 792
  855 +2708: array_close: ]
  856 +2712: name: /Parent
  857 +2720: integer: 3
  858 +2722: integer: 0
  859 +2724: word: R
  860 +2728: name: /Resources
  861 +2739: dict_open: <<
  862 +2746: name: /Font
  863 +2752: dict_open: <<
  864 +2761: name: /F1
  865 +2765: integer: 31
  866 +2768: integer: 0
  867 +2770: word: R
  868 +2776: dict_close: >>
  869 +2783: name: /ProcSet
  870 +2792: integer: 32
  871 +2795: integer: 0
  872 +2797: word: R
  873 +2801: dict_close: >>
  874 +2806: name: /Type
  875 +2812: name: /Page
  876 +2818: dict_close: >>
  877 +2894: dict_open: <<
  878 +2899: name: /Contents
  879 +2909: integer: 55
  880 +2912: integer: 0
  881 +2914: word: R
  882 +2918: name: /MediaBox
  883 +2928: array_open: [
  884 +2934: integer: 0
  885 +2940: integer: 0
  886 +2946: integer: 612
  887 +2954: integer: 792
  888 +2960: array_close: ]
  889 +2964: name: /Parent
  890 +2972: integer: 3
  891 +2974: integer: 0
  892 +2976: word: R
  893 +2980: name: /Resources
  894 +2991: dict_open: <<
  895 +2998: name: /Font
  896 +3004: dict_open: <<
  897 +3013: name: /F1
  898 +3017: integer: 33
  899 +3020: integer: 0
  900 +3022: word: R
  901 +3028: dict_close: >>
  902 +3035: name: /ProcSet
  903 +3044: integer: 34
  904 +3047: integer: 0
  905 +3049: word: R
  906 +3053: dict_close: >>
  907 +3058: name: /Type
  908 +3064: name: /Page
  909 +3070: dict_close: >>
  910 +3146: dict_open: <<
  911 +3151: name: /Contents
  912 +3161: integer: 57
  913 +3164: integer: 0
  914 +3166: word: R
  915 +3170: name: /MediaBox
  916 +3180: array_open: [
  917 +3186: integer: 0
  918 +3192: integer: 0
  919 +3198: integer: 612
  920 +3206: integer: 792
  921 +3212: array_close: ]
  922 +3216: name: /Parent
  923 +3224: integer: 3
  924 +3226: integer: 0
  925 +3228: word: R
  926 +3232: name: /Resources
  927 +3243: dict_open: <<
  928 +3250: name: /Font
  929 +3256: dict_open: <<
  930 +3265: name: /F1
  931 +3269: integer: 35
  932 +3272: integer: 0
  933 +3274: word: R
  934 +3280: dict_close: >>
  935 +3287: name: /ProcSet
  936 +3296: integer: 36
  937 +3299: integer: 0
  938 +3301: word: R
  939 +3305: dict_close: >>
  940 +3310: name: /Type
  941 +3316: name: /Page
  942 +3322: dict_close: >>
  943 +3387: dict_open: <<
  944 +3392: name: /BaseFont
  945 +3402: name: /Helvetica
  946 +3415: name: /Encoding
  947 +3425: name: /WinAnsiEncoding
  948 +3444: name: /Name
  949 +3450: name: /F1
  950 +3456: name: /Subtype
  951 +3465: name: /Type1
  952 +3474: name: /Type
  953 +3480: name: /Font
  954 +3486: dict_close: >>
  955 +3551: array_open: [
  956 +3555: name: /PDF
  957 +3562: name: /Text
  958 +3568: array_close: ]
  959 +3632: dict_open: <<
  960 +3637: name: /BaseFont
  961 +3647: name: /Helvetica
  962 +3660: name: /Encoding
  963 +3670: name: /WinAnsiEncoding
  964 +3689: name: /Name
  965 +3695: name: /F1
  966 +3701: name: /Subtype
  967 +3710: name: /Type1
  968 +3719: name: /Type
  969 +3725: name: /Font
  970 +3731: dict_close: >>
  971 +3796: array_open: [
  972 +3800: name: /PDF
  973 +3807: name: /Text
  974 +3813: array_close: ]
  975 +3877: dict_open: <<
  976 +3882: name: /BaseFont
  977 +3892: name: /Helvetica
  978 +3905: name: /Encoding
  979 +3915: name: /WinAnsiEncoding
  980 +3934: name: /Name
  981 +3940: name: /F1
  982 +3946: name: /Subtype
  983 +3955: name: /Type1
  984 +3964: name: /Type
  985 +3970: name: /Font
  986 +3976: dict_close: >>
  987 +4041: array_open: [
  988 +4045: name: /PDF
  989 +4052: name: /Text
  990 +4058: array_close: ]
  991 +4122: dict_open: <<
  992 +4127: name: /BaseFont
  993 +4137: name: /Helvetica
  994 +4150: name: /Encoding
  995 +4160: name: /WinAnsiEncoding
  996 +4179: name: /Name
  997 +4185: name: /F1
  998 +4191: name: /Subtype
  999 +4200: name: /Type1
  1000 +4209: name: /Type
  1001 +4215: name: /Font
  1002 +4221: dict_close: >>
  1003 +4286: array_open: [
  1004 +4290: name: /PDF
  1005 +4297: name: /Text
  1006 +4303: array_close: ]
  1007 +4367: dict_open: <<
  1008 +4372: name: /BaseFont
  1009 +4382: name: /Helvetica
  1010 +4395: name: /Encoding
  1011 +4405: name: /WinAnsiEncoding
  1012 +4424: name: /Name
  1013 +4430: name: /F1
  1014 +4436: name: /Subtype
  1015 +4445: name: /Type1
  1016 +4454: name: /Type
  1017 +4460: name: /Font
  1018 +4466: dict_close: >>
  1019 +4531: array_open: [
  1020 +4535: name: /PDF
  1021 +4542: name: /Text
  1022 +4548: array_close: ]
  1023 +4612: dict_open: <<
  1024 +4617: name: /BaseFont
  1025 +4627: name: /Helvetica
  1026 +4640: name: /Encoding
  1027 +4650: name: /WinAnsiEncoding
  1028 +4669: name: /Name
  1029 +4675: name: /F1
  1030 +4681: name: /Subtype
  1031 +4690: name: /Type1
  1032 +4699: name: /Type
  1033 +4705: name: /Font
  1034 +4711: dict_close: >>
  1035 +4776: array_open: [
  1036 +4780: name: /PDF
  1037 +4787: name: /Text
  1038 +4793: array_close: ]
  1039 +4857: dict_open: <<
  1040 +4862: name: /BaseFont
  1041 +4872: name: /Helvetica
  1042 +4885: name: /Encoding
  1043 +4895: name: /WinAnsiEncoding
  1044 +4914: name: /Name
  1045 +4920: name: /F1
  1046 +4926: name: /Subtype
  1047 +4935: name: /Type1
  1048 +4944: name: /Type
  1049 +4950: name: /Font
  1050 +4956: dict_close: >>
  1051 +5021: array_open: [
  1052 +5025: name: /PDF
  1053 +5032: name: /Text
  1054 +5038: array_close: ]
  1055 +5102: dict_open: <<
  1056 +5107: name: /BaseFont
  1057 +5117: name: /Helvetica
  1058 +5130: name: /Encoding
  1059 +5140: name: /WinAnsiEncoding
  1060 +5159: name: /Name
  1061 +5165: name: /F1
  1062 +5171: name: /Subtype
  1063 +5180: name: /Type1
  1064 +5189: name: /Type
  1065 +5195: name: /Font
  1066 +5201: dict_close: >>
  1067 +5266: array_open: [
  1068 +5270: name: /PDF
  1069 +5277: name: /Text
  1070 +5283: array_close: ]
  1071 +5347: dict_open: <<
  1072 +5352: name: /BaseFont
  1073 +5362: name: /Helvetica
  1074 +5375: name: /Encoding
  1075 +5385: name: /WinAnsiEncoding
  1076 +5404: name: /Name
  1077 +5410: name: /F1
  1078 +5416: name: /Subtype
  1079 +5425: name: /Type1
  1080 +5434: name: /Type
  1081 +5440: name: /Font
  1082 +5446: dict_close: >>
  1083 +5511: array_open: [
  1084 +5515: name: /PDF
  1085 +5522: name: /Text
  1086 +5528: array_close: ]
  1087 +5592: dict_open: <<
  1088 +5597: name: /BaseFont
  1089 +5607: name: /Helvetica
  1090 +5620: name: /Encoding
  1091 +5630: name: /WinAnsiEncoding
  1092 +5649: name: /Name
  1093 +5655: name: /F1
  1094 +5661: name: /Subtype
  1095 +5670: name: /Type1
  1096 +5679: name: /Type
  1097 +5685: name: /Font
  1098 +5691: dict_close: >>
  1099 +5756: array_open: [
  1100 +5760: name: /PDF
  1101 +5767: name: /Text
  1102 +5773: array_close: ]
  1103 +5837: dict_open: <<
  1104 +5842: name: /BaseFont
  1105 +5852: name: /Helvetica
  1106 +5865: name: /Encoding
  1107 +5875: name: /WinAnsiEncoding
  1108 +5894: name: /Name
  1109 +5900: name: /F1
  1110 +5906: name: /Subtype
  1111 +5915: name: /Type1
  1112 +5924: name: /Type
  1113 +5930: name: /Font
  1114 +5936: dict_close: >>
  1115 +6001: array_open: [
  1116 +6005: name: /PDF
  1117 +6012: name: /Text
  1118 +6018: array_close: ]
  1119 +6020: eof
  1120 +--- END OBJECT STREAM 1 ---
qpdf/qtest/qpdf/tokens.pdf 0 → 100644
No preview for this file type
qpdf/test_tokenizer.cc 0 → 100644
  1 +#include <qpdf/QPDFTokenizer.hh>
  2 +#include <qpdf/QUtil.hh>
  3 +#include <qpdf/FileInputSource.hh>
  4 +#include <qpdf/BufferInputSource.hh>
  5 +#include <qpdf/QPDF.hh>
  6 +#include <qpdf/Pl_Buffer.hh>
  7 +#include <stdlib.h>
  8 +#include <stdio.h>
  9 +#include <string.h>
  10 +#include <iostream>
  11 +
  12 +static char const* whoami = 0;
  13 +
  14 +void usage()
  15 +{
  16 + std::cerr << "Usage: " << whoami << " filename"
  17 + << std::endl;
  18 + exit(2);
  19 +}
  20 +
  21 +class Finder: public InputSource::Finder
  22 +{
  23 + public:
  24 + Finder(PointerHolder<InputSource> is, std::string const& str) :
  25 + is(is),
  26 + str(str)
  27 + {
  28 + }
  29 + virtual ~Finder()
  30 + {
  31 + }
  32 + virtual bool check();
  33 +
  34 + private:
  35 + PointerHolder<InputSource> is;
  36 + std::string str;
  37 +};
  38 +
  39 +bool
  40 +Finder::check()
  41 +{
  42 + QPDFTokenizer tokenizer;
  43 + QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true);
  44 + qpdf_offset_t offset = this->is->tell();
  45 + bool result = (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str));
  46 + this->is->seek(offset - this->str.length(), SEEK_SET);
  47 + return result;
  48 +}
  49 +
  50 +static char const* tokenTypeName(QPDFTokenizer::token_type_e ttype)
  51 +{
  52 + // Do this is a case statement instead of a lookup so the compiler
  53 + // will warn if we miss any.
  54 + switch (ttype)
  55 + {
  56 + case QPDFTokenizer::tt_bad:
  57 + return "bad";
  58 + case QPDFTokenizer::tt_array_close:
  59 + return "array_close";
  60 + case QPDFTokenizer::tt_array_open:
  61 + return "array_open";
  62 + case QPDFTokenizer::tt_brace_close:
  63 + return "brace_close";
  64 + case QPDFTokenizer::tt_brace_open:
  65 + return "brace_open";
  66 + case QPDFTokenizer::tt_dict_close:
  67 + return "dict_close";
  68 + case QPDFTokenizer::tt_dict_open:
  69 + return "dict_open";
  70 + case QPDFTokenizer::tt_integer:
  71 + return "integer";
  72 + case QPDFTokenizer::tt_name:
  73 + return "name";
  74 + case QPDFTokenizer::tt_real:
  75 + return "real";
  76 + case QPDFTokenizer::tt_string:
  77 + return "string";
  78 + case QPDFTokenizer::tt_null:
  79 + return "null";
  80 + case QPDFTokenizer::tt_bool:
  81 + return "bool";
  82 + case QPDFTokenizer::tt_word:
  83 + return "word";
  84 + case QPDFTokenizer::tt_eof:
  85 + return "eof";
  86 + }
  87 + return 0;
  88 +}
  89 +
  90 +static std::string
  91 +sanitize(std::string const& value)
  92 +{
  93 + std::string result;
  94 + for (std::string::const_iterator iter = value.begin(); iter != value.end();
  95 + ++iter)
  96 + {
  97 + if ((*iter >= 32) && (*iter <= 126))
  98 + {
  99 + result.append(1, *iter);
  100 + }
  101 + else
  102 + {
  103 + result += "\\x" + QUtil::int_to_string_base(
  104 + static_cast<unsigned char>(*iter), 16, 2);
  105 + }
  106 + }
  107 + return result;
  108 +}
  109 +
  110 +static void
  111 +try_skipping(PointerHolder<InputSource> is, char const* what, Finder& f)
  112 +{
  113 + std::cout << "skipping to " << what << std::endl;
  114 + qpdf_offset_t offset = is->tell();
  115 + if (! is->findFirst(what, offset, 0, f))
  116 + {
  117 + std::cout << what << " not found" << std::endl;
  118 + is->seek(offset, SEEK_SET);
  119 + }
  120 +}
  121 +
  122 +static void
  123 +dump_tokens(PointerHolder<InputSource> is, std::string const& label,
  124 + bool skip_streams, bool skip_inline_images)
  125 +{
  126 + Finder f1(is, "endstream");
  127 + Finder f2(is, "EI");
  128 + std::cout << "--- BEGIN " << label << " ---" << std::endl;
  129 + bool done = false;
  130 + QPDFTokenizer tokenizer;
  131 + tokenizer.allowEOF();
  132 + while (! done)
  133 + {
  134 + QPDFTokenizer::Token token = tokenizer.readToken(is, "test", true);
  135 +
  136 + qpdf_offset_t offset = is->tell() - token.getRawValue().length();
  137 + std::cout << offset << ": "
  138 + << tokenTypeName(token.getType());
  139 + if (token.getType() != QPDFTokenizer::tt_eof)
  140 + {
  141 + std::cout << ": "
  142 + << sanitize(token.getValue());
  143 + if (token.getValue() != token.getRawValue())
  144 + {
  145 + std::cout << " (raw: " << sanitize(token.getRawValue()) << ")";
  146 + }
  147 + }
  148 + if (token.getType() == QPDFTokenizer::tt_bad)
  149 + {
  150 + std::cout << " (" << token.getErrorMessage() << ")";
  151 + }
  152 + std::cout << std::endl;
  153 + if (skip_streams &&
  154 + (token == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "stream")))
  155 + {
  156 + try_skipping(is, "endstream", f1);
  157 + }
  158 + else if (skip_inline_images &&
  159 + (token == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "ID")))
  160 + {
  161 + try_skipping(is, "EI", f2);
  162 + }
  163 + else if (token.getType() == QPDFTokenizer::tt_eof)
  164 + {
  165 + done = true;
  166 + }
  167 + }
  168 + std::cout << "--- END " << label << " ---" << std::endl;
  169 +}
  170 +
  171 +static void process(char const* filename)
  172 +{
  173 + PointerHolder<InputSource> is;
  174 + QPDFTokenizer tokenizer;
  175 + tokenizer.allowEOF();
  176 +
  177 + // Tokenize file, skipping streams
  178 + FileInputSource* fis = new FileInputSource();
  179 + fis->setFilename(filename);
  180 + is = fis;
  181 + dump_tokens(is, "FILE", true, false);
  182 +
  183 + // Tokenize content streams, skipping inline images
  184 + QPDF qpdf;
  185 + qpdf.processFile(filename);
  186 + std::vector<QPDFObjectHandle> pages = qpdf.getAllPages();
  187 + int pageno = 0;
  188 + for (std::vector<QPDFObjectHandle>::iterator iter = pages.begin();
  189 + iter != pages.end(); ++iter)
  190 + {
  191 + ++pageno;
  192 + Pl_Buffer plb("buffer");
  193 + std::vector<QPDFObjectHandle> contents = (*iter).getPageContents();
  194 + for (std::vector<QPDFObjectHandle>::iterator citer = contents.begin();
  195 + citer != contents.end(); ++citer)
  196 + {
  197 + (*citer).pipeStreamData(&plb, 0, qpdf_dl_specialized);
  198 + }
  199 + plb.finish();
  200 + PointerHolder<Buffer> content_data = plb.getBuffer();
  201 + BufferInputSource* bis = new BufferInputSource(
  202 + "content data", content_data.getPointer());
  203 + is = bis;
  204 + dump_tokens(is, "PAGE " + QUtil::int_to_string(pageno), false, true);
  205 + }
  206 +
  207 + // Tokenize object streams
  208 + std::vector<QPDFObjectHandle> all = qpdf.getAllObjects();
  209 + for (std::vector<QPDFObjectHandle>::iterator iter = all.begin();
  210 + iter != all.end(); ++iter)
  211 + {
  212 + if ((*iter).isStream() &&
  213 + (*iter).getDict().getKey("/Type").isName() &&
  214 + (*iter).getDict().getKey("/Type").getName() == "/ObjStm")
  215 + {
  216 + PointerHolder<Buffer> b =
  217 + (*iter).getStreamData(qpdf_dl_specialized);
  218 + BufferInputSource* bis = new BufferInputSource(
  219 + "object stream data", b.getPointer());
  220 + is = bis;
  221 + dump_tokens(is, "OBJECT STREAM " +
  222 + QUtil::int_to_string((*iter).getObjectID()),
  223 + false, false);
  224 + }
  225 + }
  226 +}
  227 +
  228 +int main(int argc, char* argv[])
  229 +{
  230 + QUtil::setLineBuf(stdout);
  231 + if ((whoami = strrchr(argv[0], '/')) == NULL)
  232 + {
  233 + whoami = argv[0];
  234 + }
  235 + else
  236 + {
  237 + ++whoami;
  238 + }
  239 + // For libtool's sake....
  240 + if (strncmp(whoami, "lt-", 3) == 0)
  241 + {
  242 + whoami += 3;
  243 + }
  244 +
  245 + if (argc != 2)
  246 + {
  247 + usage();
  248 + }
  249 +
  250 + char const* filename = argv[1];
  251 + try
  252 + {
  253 + process(filename);
  254 + }
  255 + catch (std::exception& e)
  256 + {
  257 + std::cerr << whoami << ": exception: " << e.what();
  258 + exit(2);
  259 + }
  260 + return 0;
  261 +}