Commit 67d5ed3a64a81f9192c17dc71f02e69f60f8a1f8

Authored by Jay Berkenbilt
1 parent 1e766dcd

Implement remove-unreferenced-resources=auto

qpdf/qpdf.cc
... ... @@ -4750,6 +4750,140 @@ static void handle_transformations(QPDF& pdf, Options& o)
4750 4750 }
4751 4751 }
4752 4752  
  4753 +static bool should_remove_unreferenced_resources(QPDF& pdf, Options& o)
  4754 +{
  4755 + if (o.remove_unreferenced_page_resources == re_no)
  4756 + {
  4757 + return false;
  4758 + }
  4759 + else if (o.remove_unreferenced_page_resources == re_yes)
  4760 + {
  4761 + return true;
  4762 + }
  4763 +
  4764 + // Unreferenced resources are common in files where resources
  4765 + // dictionaries are shared across pages. As a heuristic, we look
  4766 + // in the file for shared resources dictionaries or shared XObject
  4767 + // subkeys of resources dictionaries either on pages or on form
  4768 + // XObjects in pages. If we find any, then there is a higher
  4769 + // likeilihood that the expensive process of finding unreferenced
  4770 + // resources is worth it.
  4771 +
  4772 + // Return true as soon as we find any shared resources.
  4773 +
  4774 + std::set<QPDFObjGen> resources_seen; // shared resources detection
  4775 + std::set<QPDFObjGen> nodes_seen; // loop detection
  4776 +
  4777 + if (o.verbose)
  4778 + {
  4779 + std::cout << whoami << ": " << pdf.getFilename()
  4780 + << ": checking for shared resources" << std::endl;
  4781 + }
  4782 +
  4783 + std::list<QPDFObjectHandle> queue;
  4784 + queue.push_back(pdf.getRoot().getKey("/Pages"));
  4785 + while (! queue.empty())
  4786 + {
  4787 + QPDFObjectHandle node = *queue.begin();
  4788 + QPDFObjGen og = node.getObjGen();
  4789 + if (nodes_seen.count(og))
  4790 + {
  4791 + continue;
  4792 + }
  4793 + nodes_seen.insert(og);
  4794 + queue.pop_front();
  4795 + QPDFObjectHandle dict = node.isStream() ? node.getDict() : node;
  4796 + QPDFObjectHandle kids = dict.getKey("/Kids");
  4797 + if (kids.isArray())
  4798 + {
  4799 + // This is a non-leaf node.
  4800 + if (dict.hasKey("/Resources"))
  4801 + {
  4802 + QTC::TC("qpdf", "qpdf found resources in non-leaf");
  4803 + if (o.verbose)
  4804 + {
  4805 + std::cout << " found resources in non-leaf page node "
  4806 + << og.getObj() << " " << og.getGen()
  4807 + << std::endl;
  4808 + }
  4809 + return true;
  4810 + }
  4811 + int n = kids.getArrayNItems();
  4812 + for (int i = 0; i < n; ++i)
  4813 + {
  4814 + queue.push_back(kids.getArrayItem(i));
  4815 + }
  4816 + }
  4817 + else
  4818 + {
  4819 + // This is a leaf node or a form XObject.
  4820 + QPDFObjectHandle resources = dict.getKey("/Resources");
  4821 + if (resources.isIndirect())
  4822 + {
  4823 + QPDFObjGen resources_og = resources.getObjGen();
  4824 + if (resources_seen.count(resources_og))
  4825 + {
  4826 + QTC::TC("qpdf", "qpdf found shared resources in leaf");
  4827 + if (o.verbose)
  4828 + {
  4829 + std::cout << " found shared resources in leaf node "
  4830 + << og.getObj() << " " << og.getGen()
  4831 + << ": "
  4832 + << resources_og.getObj() << " "
  4833 + << resources_og.getGen()
  4834 + << std::endl;
  4835 + }
  4836 + return true;
  4837 + }
  4838 + resources_seen.insert(resources_og);
  4839 + }
  4840 + QPDFObjectHandle xobject = resources.getKey("/XObject");
  4841 + if (xobject.isIndirect())
  4842 + {
  4843 + QPDFObjGen xobject_og = xobject.getObjGen();
  4844 + if (resources_seen.count(xobject_og))
  4845 + {
  4846 + QTC::TC("qpdf", "qpdf found shared xobject in leaf");
  4847 + if (o.verbose)
  4848 + {
  4849 + std::cout << " found shared xobject in leaf node "
  4850 + << og.getObj() << " " << og.getGen()
  4851 + << ": "
  4852 + << xobject_og.getObj() << " "
  4853 + << xobject_og.getGen()
  4854 + << std::endl;
  4855 + }
  4856 + return true;
  4857 + }
  4858 + resources_seen.insert(xobject_og);
  4859 + }
  4860 + if (xobject.isDictionary())
  4861 + {
  4862 + for (auto k: xobject.getKeys())
  4863 + {
  4864 + QPDFObjectHandle xobj = xobject.getKey(k);
  4865 + if (xobj.isStream() &&
  4866 + xobj.getDict().getKey("/Type").isName() &&
  4867 + ("/XObject" ==
  4868 + xobj.getDict().getKey("/Type").getName()) &&
  4869 + xobj.getDict().getKey("/Subtype").isName() &&
  4870 + ("/Form" ==
  4871 + xobj.getDict().getKey("/Subtype").getName()))
  4872 + {
  4873 + queue.push_back(xobj);
  4874 + }
  4875 + }
  4876 + }
  4877 + }
  4878 + }
  4879 +
  4880 + if (o.verbose)
  4881 + {
  4882 + std::cout << whoami << ": no shared resources found" << std::endl;
  4883 + }
  4884 + return false;
  4885 +}
  4886 +
4753 4887 static void handle_page_specs(QPDF& pdf, Options& o)
4754 4888 {
4755 4889 // Parse all page specifications and translate them into lists of
... ... @@ -4883,8 +5017,12 @@ static void handle_page_specs(QPDF& pdf, Options& o)
4883 5017 cis = page_spec_cfis[filename];
4884 5018 cis->stayOpen(true);
4885 5019 }
4886   - QPDFPageDocumentHelper dh(*((*iter).second));
4887   - dh.removeUnreferencedResources();
  5020 + QPDF& other(*((*iter).second));
  5021 + if (should_remove_unreferenced_resources(other, o))
  5022 + {
  5023 + QPDFPageDocumentHelper dh(other);
  5024 + dh.removeUnreferencedResources();
  5025 + }
4888 5026 if (cis)
4889 5027 {
4890 5028 cis->stayOpen(false);
... ... @@ -5368,7 +5506,7 @@ static void do_split_pages(QPDF& pdf, Options& o)
5368 5506 before = std::string(o.outfilename) + "-";
5369 5507 }
5370 5508  
5371   - if (o.remove_unreferenced_page_resources != re_no)
  5509 + if (should_remove_unreferenced_resources(pdf, o))
5372 5510 {
5373 5511 QPDFPageDocumentHelper dh(pdf);
5374 5512 dh.removeUnreferencedResources();
... ...
qpdf/qpdf.testcov
... ... @@ -450,3 +450,6 @@ QPDFWriter no encryption sig contents 0
450 450 QPDFPageObjectHelper colorspace lookup 0
451 451 QPDFWriter ignore XRef in qdf mode 0
452 452 QPDFPageObjectHelper filter form xobject 0
  453 +qpdf found resources in non-leaf 0
  454 +qpdf found shared resources in leaf 0
  455 +qpdf found shared xobject in leaf 0
... ...
qpdf/qtest/qpdf.test
... ... @@ -1699,7 +1699,7 @@ my @sp_cases = (
1699 1699 [11, 'pdf extension', '', 'split-out.Pdf'],
1700 1700 [4, 'fallback', '--pages 11-pages.pdf 1-3 minimal.pdf --', 'split-out'],
1701 1701 );
1702   -$n_tests += 32;
  1702 +$n_tests += 35;
1703 1703 $n_compare_pdfs += 1;
1704 1704 for (@sp_cases)
1705 1705 {
... ... @@ -1808,6 +1808,7 @@ foreach my $i (qw(1 2 3 4))
1808 1808 $td->runtest("unreferenced resources with bad token",
1809 1809 {$td->COMMAND =>
1810 1810 "qpdf --qdf --static-id --split-pages=2" .
  1811 + " --remove-unreferenced-resources=yes" .
1811 1812 " coalesce.pdf split-out-bad-token.pdf"},
1812 1813 {$td->FILE => "coalesce-split.out", $td->EXIT_STATUS => 3},
1813 1814 $td->NORMALIZE_NEWLINES);
... ... @@ -1834,6 +1835,18 @@ $td->runtest("check output",
1834 1835 {$td->FILE => "shared-form-images-merged.pdf"});
1835 1836 compare_pdfs("shared-form-images.pdf", "a.pdf");
1836 1837  
  1838 +$td->runtest("shared form xobject subkey",
  1839 + {$td->COMMAND => "qpdf --qdf --static-id --split-pages".
  1840 + " shared-form-images-xobject.pdf" .
  1841 + " split-out-shared-form-xobject.pdf"},
  1842 + {$td->STRING => "", $td->EXIT_STATUS => 0});
  1843 +foreach my $i (qw(1 2))
  1844 +{
  1845 + $td->runtest("check output ($i)",
  1846 + {$td->FILE => "split-out-shared-form-xobject-$i.pdf"},
  1847 + {$td->FILE => "shared-form-xobject-split-$i.pdf"});
  1848 +}
  1849 +
1837 1850 show_ntests();
1838 1851 # ----------
1839 1852 $td->notify("--- Keep Files Open ---");
... ...
qpdf/qtest/qpdf/disable-kfo.out
... ... @@ -50,6 +50,110 @@ qpdf: processing 048-kfo.pdf
50 50 qpdf: processing 049-kfo.pdf
51 51 qpdf: processing 050-kfo.pdf
52 52 qpdf: processing 051-kfo.pdf
  53 +qpdf: empty PDF: checking for shared resources
  54 +qpdf: no shared resources found
  55 +qpdf: 001-kfo.pdf: checking for shared resources
  56 +qpdf: no shared resources found
  57 +qpdf: 002-kfo.pdf: checking for shared resources
  58 +qpdf: no shared resources found
  59 +qpdf: 003-kfo.pdf: checking for shared resources
  60 +qpdf: no shared resources found
  61 +qpdf: 004-kfo.pdf: checking for shared resources
  62 +qpdf: no shared resources found
  63 +qpdf: 005-kfo.pdf: checking for shared resources
  64 +qpdf: no shared resources found
  65 +qpdf: 006-kfo.pdf: checking for shared resources
  66 +qpdf: no shared resources found
  67 +qpdf: 007-kfo.pdf: checking for shared resources
  68 +qpdf: no shared resources found
  69 +qpdf: 008-kfo.pdf: checking for shared resources
  70 +qpdf: no shared resources found
  71 +qpdf: 009-kfo.pdf: checking for shared resources
  72 +qpdf: no shared resources found
  73 +qpdf: 010-kfo.pdf: checking for shared resources
  74 +qpdf: no shared resources found
  75 +qpdf: 011-kfo.pdf: checking for shared resources
  76 +qpdf: no shared resources found
  77 +qpdf: 012-kfo.pdf: checking for shared resources
  78 +qpdf: no shared resources found
  79 +qpdf: 013-kfo.pdf: checking for shared resources
  80 +qpdf: no shared resources found
  81 +qpdf: 014-kfo.pdf: checking for shared resources
  82 +qpdf: no shared resources found
  83 +qpdf: 015-kfo.pdf: checking for shared resources
  84 +qpdf: no shared resources found
  85 +qpdf: 016-kfo.pdf: checking for shared resources
  86 +qpdf: no shared resources found
  87 +qpdf: 017-kfo.pdf: checking for shared resources
  88 +qpdf: no shared resources found
  89 +qpdf: 018-kfo.pdf: checking for shared resources
  90 +qpdf: no shared resources found
  91 +qpdf: 019-kfo.pdf: checking for shared resources
  92 +qpdf: no shared resources found
  93 +qpdf: 020-kfo.pdf: checking for shared resources
  94 +qpdf: no shared resources found
  95 +qpdf: 021-kfo.pdf: checking for shared resources
  96 +qpdf: no shared resources found
  97 +qpdf: 022-kfo.pdf: checking for shared resources
  98 +qpdf: no shared resources found
  99 +qpdf: 023-kfo.pdf: checking for shared resources
  100 +qpdf: no shared resources found
  101 +qpdf: 024-kfo.pdf: checking for shared resources
  102 +qpdf: no shared resources found
  103 +qpdf: 025-kfo.pdf: checking for shared resources
  104 +qpdf: no shared resources found
  105 +qpdf: 026-kfo.pdf: checking for shared resources
  106 +qpdf: no shared resources found
  107 +qpdf: 027-kfo.pdf: checking for shared resources
  108 +qpdf: no shared resources found
  109 +qpdf: 028-kfo.pdf: checking for shared resources
  110 +qpdf: no shared resources found
  111 +qpdf: 029-kfo.pdf: checking for shared resources
  112 +qpdf: no shared resources found
  113 +qpdf: 030-kfo.pdf: checking for shared resources
  114 +qpdf: no shared resources found
  115 +qpdf: 031-kfo.pdf: checking for shared resources
  116 +qpdf: no shared resources found
  117 +qpdf: 032-kfo.pdf: checking for shared resources
  118 +qpdf: no shared resources found
  119 +qpdf: 033-kfo.pdf: checking for shared resources
  120 +qpdf: no shared resources found
  121 +qpdf: 034-kfo.pdf: checking for shared resources
  122 +qpdf: no shared resources found
  123 +qpdf: 035-kfo.pdf: checking for shared resources
  124 +qpdf: no shared resources found
  125 +qpdf: 036-kfo.pdf: checking for shared resources
  126 +qpdf: no shared resources found
  127 +qpdf: 037-kfo.pdf: checking for shared resources
  128 +qpdf: no shared resources found
  129 +qpdf: 038-kfo.pdf: checking for shared resources
  130 +qpdf: no shared resources found
  131 +qpdf: 039-kfo.pdf: checking for shared resources
  132 +qpdf: no shared resources found
  133 +qpdf: 040-kfo.pdf: checking for shared resources
  134 +qpdf: no shared resources found
  135 +qpdf: 041-kfo.pdf: checking for shared resources
  136 +qpdf: no shared resources found
  137 +qpdf: 042-kfo.pdf: checking for shared resources
  138 +qpdf: no shared resources found
  139 +qpdf: 043-kfo.pdf: checking for shared resources
  140 +qpdf: no shared resources found
  141 +qpdf: 044-kfo.pdf: checking for shared resources
  142 +qpdf: no shared resources found
  143 +qpdf: 045-kfo.pdf: checking for shared resources
  144 +qpdf: no shared resources found
  145 +qpdf: 046-kfo.pdf: checking for shared resources
  146 +qpdf: no shared resources found
  147 +qpdf: 047-kfo.pdf: checking for shared resources
  148 +qpdf: no shared resources found
  149 +qpdf: 048-kfo.pdf: checking for shared resources
  150 +qpdf: no shared resources found
  151 +qpdf: 049-kfo.pdf: checking for shared resources
  152 +qpdf: no shared resources found
  153 +qpdf: 050-kfo.pdf: checking for shared resources
  154 +qpdf: no shared resources found
  155 +qpdf: 051-kfo.pdf: checking for shared resources
  156 +qpdf: no shared resources found
53 157 qpdf: removing unreferenced pages from primary input
54 158 qpdf: adding pages from 001-kfo.pdf
55 159 qpdf: adding pages from 002-kfo.pdf
... ...
qpdf/qtest/qpdf/enable-kfo.out
... ... @@ -9,6 +9,28 @@ qpdf: processing 016-kfo.pdf
9 9 qpdf: processing 017-kfo.pdf
10 10 qpdf: processing 018-kfo.pdf
11 11 qpdf: processing 019-kfo.pdf
  12 +qpdf: empty PDF: checking for shared resources
  13 +qpdf: no shared resources found
  14 +qpdf: 010-kfo.pdf: checking for shared resources
  15 +qpdf: no shared resources found
  16 +qpdf: 011-kfo.pdf: checking for shared resources
  17 +qpdf: no shared resources found
  18 +qpdf: 012-kfo.pdf: checking for shared resources
  19 +qpdf: no shared resources found
  20 +qpdf: 013-kfo.pdf: checking for shared resources
  21 +qpdf: no shared resources found
  22 +qpdf: 014-kfo.pdf: checking for shared resources
  23 +qpdf: no shared resources found
  24 +qpdf: 015-kfo.pdf: checking for shared resources
  25 +qpdf: no shared resources found
  26 +qpdf: 016-kfo.pdf: checking for shared resources
  27 +qpdf: no shared resources found
  28 +qpdf: 017-kfo.pdf: checking for shared resources
  29 +qpdf: no shared resources found
  30 +qpdf: 018-kfo.pdf: checking for shared resources
  31 +qpdf: no shared resources found
  32 +qpdf: 019-kfo.pdf: checking for shared resources
  33 +qpdf: no shared resources found
12 34 qpdf: removing unreferenced pages from primary input
13 35 qpdf: adding pages from 010-kfo.pdf
14 36 qpdf: adding pages from 011-kfo.pdf
... ...
qpdf/qtest/qpdf/kfo-n.out
... ... @@ -7,6 +7,26 @@ qpdf: processing 006-kfo.pdf
7 7 qpdf: processing 007-kfo.pdf
8 8 qpdf: processing 008-kfo.pdf
9 9 qpdf: processing 009-kfo.pdf
  10 +qpdf: empty PDF: checking for shared resources
  11 +qpdf: no shared resources found
  12 +qpdf: 001-kfo.pdf: checking for shared resources
  13 +qpdf: no shared resources found
  14 +qpdf: 002-kfo.pdf: checking for shared resources
  15 +qpdf: no shared resources found
  16 +qpdf: 003-kfo.pdf: checking for shared resources
  17 +qpdf: no shared resources found
  18 +qpdf: 004-kfo.pdf: checking for shared resources
  19 +qpdf: no shared resources found
  20 +qpdf: 005-kfo.pdf: checking for shared resources
  21 +qpdf: no shared resources found
  22 +qpdf: 006-kfo.pdf: checking for shared resources
  23 +qpdf: no shared resources found
  24 +qpdf: 007-kfo.pdf: checking for shared resources
  25 +qpdf: no shared resources found
  26 +qpdf: 008-kfo.pdf: checking for shared resources
  27 +qpdf: no shared resources found
  28 +qpdf: 009-kfo.pdf: checking for shared resources
  29 +qpdf: no shared resources found
10 30 qpdf: removing unreferenced pages from primary input
11 31 qpdf: adding pages from 001-kfo.pdf
12 32 qpdf: adding pages from 002-kfo.pdf
... ...
qpdf/qtest/qpdf/kfo-y.out
... ... @@ -7,6 +7,26 @@ qpdf: processing 006-kfo.pdf
7 7 qpdf: processing 007-kfo.pdf
8 8 qpdf: processing 008-kfo.pdf
9 9 qpdf: processing 009-kfo.pdf
  10 +qpdf: empty PDF: checking for shared resources
  11 +qpdf: no shared resources found
  12 +qpdf: 001-kfo.pdf: checking for shared resources
  13 +qpdf: no shared resources found
  14 +qpdf: 002-kfo.pdf: checking for shared resources
  15 +qpdf: no shared resources found
  16 +qpdf: 003-kfo.pdf: checking for shared resources
  17 +qpdf: no shared resources found
  18 +qpdf: 004-kfo.pdf: checking for shared resources
  19 +qpdf: no shared resources found
  20 +qpdf: 005-kfo.pdf: checking for shared resources
  21 +qpdf: no shared resources found
  22 +qpdf: 006-kfo.pdf: checking for shared resources
  23 +qpdf: no shared resources found
  24 +qpdf: 007-kfo.pdf: checking for shared resources
  25 +qpdf: no shared resources found
  26 +qpdf: 008-kfo.pdf: checking for shared resources
  27 +qpdf: no shared resources found
  28 +qpdf: 009-kfo.pdf: checking for shared resources
  29 +qpdf: no shared resources found
10 30 qpdf: removing unreferenced pages from primary input
11 31 qpdf: adding pages from 001-kfo.pdf
12 32 qpdf: adding pages from 002-kfo.pdf
... ...
qpdf/qtest/qpdf/shared-form-images-xobject.pdf 0 → 100644
No preview for this file type
qpdf/qtest/qpdf/shared-form-xobject-split-1.pdf 0 → 100644
No preview for this file type
qpdf/qtest/qpdf/shared-form-xobject-split-2.pdf 0 → 100644
No preview for this file type
qpdf/qtest/qpdf/split-pages-group.out
  1 +qpdf: 11-pages.pdf: checking for shared resources
  2 +qpdf: no shared resources found
1 3 qpdf: wrote file split-out-group-01-05.pdf
2 4 qpdf: wrote file split-out-group-06-10.pdf
3 5 qpdf: wrote file split-out-group-11-11.pdf
... ...
qpdf/qtest/qpdf/uo-6.out
1 1 qpdf: selecting --keep-open-files=y
  2 +qpdf: fxo-red.pdf: checking for shared resources
  3 +qpdf: no shared resources found
2 4 qpdf: removing unreferenced pages from primary input
3 5 qpdf: adding pages from fxo-red.pdf
4 6 qpdf: processing underlay/overlay
... ...
qpdf/qtest/qpdf/verbose-merge.out
... ... @@ -2,6 +2,14 @@ qpdf: selecting --keep-open-files=y
2 2 qpdf: processing 20-pages.pdf
3 3 qpdf: processing ./20-pages.pdf
4 4 qpdf: processing minimal.pdf
  5 +qpdf: ./20-pages.pdf: checking for shared resources
  6 +qpdf: no shared resources found
  7 +qpdf: 20-pages.pdf: checking for shared resources
  8 +qpdf: no shared resources found
  9 +qpdf: minimal.pdf: checking for shared resources
  10 +qpdf: no shared resources found
  11 +qpdf: page-labels-and-outlines.pdf: checking for shared resources
  12 +qpdf: no shared resources found
5 13 qpdf: removing unreferenced pages from primary input
6 14 qpdf: adding pages from page-labels-and-outlines.pdf
7 15 qpdf: adding pages from 20-pages.pdf
... ...