diff --git a/manual/qpdf-manual.xml b/manual/qpdf-manual.xml
index 8d8ecaa..b4b07f4 100644
--- a/manual/qpdf-manual.xml
+++ b/manual/qpdf-manual.xml
@@ -1940,178 +1940,235 @@ outfile.pdf
QPDF JSON
-
- Beginning with qpdf version 8.3.0, the qpdf
- command-line program can produce a json representation of the
- non-content data in a PDF file. It includes a dump in json format
- of all objects in the PDF file excluding the content of streams.
- This json representation makes it very easy to look in detail at
- the structure of a given PDF file, and it also provides a great way
- to work with PDF files programmatically from the command-line in
- languages that can't call or link with the qpdf library directly.
- Note that stream data can be extracted from PDF files using other
- qpdf command-line options.
-
-
- The qpdf json representation includes a json serialization of the
- raw objects in the PDF file as well as some computed information in
- a more easily extracted format. QPDF provides some guarantees about
- its json format. These guarantees are designed to simplify the
- experience of a developer working with the JSON format.
-
-
- Compatibility
+
+ Overview
+
+ Beginning with qpdf version 8.3.0, the qpdf
+ command-line program can produce a json representation of the
+ non-content data in a PDF file. It includes a dump in json format
+ of all objects in the PDF file excluding the content of streams.
+ This json representation makes it very easy to look in detail at
+ the structure of a given PDF file, and it also provides a great way
+ to work with PDF files programmatically from the command-line in
+ languages that can't call or link with the qpdf library directly.
+ Note that stream data can be extracted from PDF files using other
+ qpdf command-line options.
+
+
+
+ JSON Guarantees
+
+ The qpdf json representation includes a json serialization of the
+ raw objects in the PDF file as well as some computed information in
+ a more easily extracted format. QPDF provides some guarantees about
+ its json format. These guarantees are designed to simplify the
+ experience of a developer working with the JSON format.
+
+
+ Compatibility
+
+
+ The top-level json object output is a dictionary. The json
+ output contains various nested dictionaries and arrays. With
+ the exception of dictionaries that are populated by the fields
+ of objects from the file, all instances of a dictionary are
+ guaranteed to have exactly the same keys. Future versions of
+ qpdf are free to add additional keys but not to remove keys or
+ change the type of object that a key points to. The qpdf
+ program validates this guarantee, and in the unlikely event
+ that a bug in qpdf should cause it to generate data that
+ doesn't conform to this rule, it will ask you to file a bug
+ report.
+
+
+ The top-level json structure contains a
+ “version” key whose value is
+ a simple integer. The value of the version key
+ will be incremented if a non-compatible change is made. A
+ non-compatible change would be any change that involves removal
+ of a key, a change to the format of data pointed to by a key,
+ or a semantic change that requires a different interpretation
+ of a previously existing key. A strong effort will be made to
+ avoid breaking compatibility.
+
+
+
+
+ Documentation
+
+
+ The qpdf command can be invoked with the
+ --json-help option. This will output a json
+ structure that has the same structure as the json output that
+ qpdf generates, except that each field in the help output is a
+ description of the corresponding field in the json output. The
+ specific guarantees are as follows:
+
+
+
+ A dictionary in the help output means that the corresponding
+ location in the actual json output is also a dictionary with
+ exactly the same keys; that is, no keys present in help are
+ absent in the real output, and no keys will be present in
+ the real output that are not in help.
+
+
+
+
+ A string in the help output is a description of the item
+ that appears in the corresponding location of the actual
+ output. The corresponding output can have any format.
+
+
+
+
+ An array in the help output always contains a single
+ element. It indicates that the corresponding location in the
+ actual output is also an array, and that each element of the
+ array has whatever format is implied by the single element
+ of the help output's array.
+
+
+
+ For example, the help output includes a
+ “pagelabels” key whose value is
+ an array of one element. That element is a dictionary with keys
+ “index” and
+ “label”. In addition to
+ describing the meaning of those keys, this tells you that the
+ actual json output will contain a pagelabels
+ array, each of whose elements is a dictionary that contains an
+ index key, a label key,
+ and no other keys.
+
+
+
+
+ Directness and Simplicity
+
+
+ The json output contains the value of every object in the file,
+ but it also contains some processed data. This is analogous to
+ how qpdf's library interface works. The processed data is
+ similar to the helper functions in that it allows you to look
+ at certain aspects of the PDF file without having to understand
+ all the nuances of the PDF specification, while the raw objects
+ allow you to mine the PDF for anything that the higher-level
+ interfaces are lacking.
+
+
+
+
+
+
+
+ Limitations of JSON Representation
+
+ There are a few limitations to be aware of with the json structure:
+
- The top-level json object output is a dictionary. The json
- output contains various nested dictionaries and arrays. With
- the exception of dictionaries that are populated by the fields
- of objects from the file, all instances of a dictionary are
- guaranteed to have exactly the same keys. Future versions of
- qpdf are free to add additional keys but not to remove keys or
- change the type of object that a key points to. The qpdf
- program validates this guarantee, and in the unlikely event
- that a bug in qpdf should cause it to generate data that
- doesn't conform to this rule, it will ask you to file a bug
- report.
+ Strings, names, and indirect object references in the original
+ PDF file are all converted to strings in the json
+ representation. In the case of a “normal” PDF file,
+ you can tell the difference because a name starts with a slash
+ (/), and an indirect object reference looks
+ like n n R, but if there were to be a string
+ that looked like a name or indirect object reference, there
+ would be no way to tell this from the json output. Note that
+ there are certain cases where you know for sure what something
+ is, such as knowing that dictionary keys in objects are always
+ names and that certain things in the higher-level computed data
+ are known to contain indirect object references.
+
+
- The top-level json structure contains a
- “version” key whose value is
- simple integer. The value of the version key
- will be incremented if a non-compatible change is made. A
- non-compatible change would be any change that involves removal
- of a key, a change to the format of data pointed to by a key,
- or a semantic change that requires a different interpretation
- of a previously existing key. A strong effort will be made to
- avoid breaking compatibility.
+ The json format doesn't support binary data very well. Mostly
+ the details are not important, but they are presented here for
+ information. When qpdf outputs a string in the json
+ representation, it converts the string to UTF-8, assuming usual
+ PDF string semantics. Specifically, if the original string is
+ UTF-16, it is converted to UTF-8. Otherwise, it is assumed to
+ have PDF doc encoding, and is converted to UTF-8 with that
+ assumption. This causes strange things to happen to binary
+ strings. For example, if you had the binary string
+ <038051>, this would be output to the
+ json as \u0003•Q because
+ 03 is not a printable character and
+ 80 is the bullet character in PDF doc
+ encoding and is mapped to the Unicode value
+ 2022. Since 51 is
+ Q, it is output as is. If you wanted to
+ convert back from here to a binary string, you would have to
+ recognize Unicode values whose code points are higher than
+ 0xFF and map those back to their
+ corresponding PDF doc encoding characters. There is no way to
+ tell the difference between a Unicode string that was originally
+ encoded as UTF-16 and one that was converted from PDF doc
+ encoding. In other words, it's best if you don't try to use the
+ json format to extract binary strings from the PDF file, but if
+ you really had to, it could be done. Note that qpdf's
+ option does not have this
+ limitation and will reveal the string as encoded in the original
+ file.
-
-
- Documentation
+
+
+
+
+ JSON: Special Considerations
+
+ For the most part, the built-in JSON help tells you everything you
+ need to know about the JSON format, but there are a few
+ non-obvious things to be aware of:
+
- The qpdf command can be invoked with the
- option. This will output a json
- structure that has the same structure as the json output that
- qpdf generates, except that each field in the help output is a
- description of the corresponding field in the json output. The
- specific guarantees are as follows:
-
-
-
- A dictionary in the help output means that the corresponding
- location in the actual json output is also a dictionary with
- exactly the same keys; that is, no keys present in help are
- absent in the real output, and no keys will be present in
- the real output that are not in help.
-
-
-
-
- A string in the help output is a description of the item
- that appears in the corresponding location of the actual
- output. The corresponding output can have any format.
-
-
-
-
- An array in the help output always contains a single
- element. It indicates that the corresponding location in the
- actual output is also an array, and that each element of the
- array has whatever format is implied by the single element
- of the help output's array.
-
-
-
- For example, the help output indicates includes a
- “pagelabels” key whose value is
- an array of one element. That element is a dictionary with keys
- “index” and
- “label”. In addition to
- describing the meaning of those keys, this tells you that the
- actual json output will contain a pagelabels
- array, each of whose elements is a dictionary that contains an
- index key, a label key,
- and no other keys.
+ While qpdf guarantees that keys present in the help will be
+ present in the output, those fields may be null or empty if the
+ information is not known or absent in the file. Also, if you
+ restrict the output to specific keys, the keys that are not
+ listed will be excluded entirely except for those that the
+ built-in JSON help says are always present.
-
-
- Directness and Simplicity
- The json output contains the value of every object in the file,
- but it also contains some processed data. This is analogous to
- how qpdf's library interface works. The processed data is
- similar to the helper functions in that it allows you to look
- at certain aspects of the PDF file without having to understand
- all the nuances of the PDF specification, while the raw objects
- allow you to mine the PDF for anything that the higher-level
- interfaces are lacking.
+ In a few places, there are keys with names containing
+ pageposfrom1. The values of these keys are
+ null or an integer. If an integer, they point to a page index
+ within the file numbering from 1. Note that json indexes from
+ 0, and you would also use 0-based indexing using the API.
+ However, 1-based indexing is easier in this case because the
+ command-line syntax for specifying page ranges is 1-based. If
+ you were going to write a program that looked through the json
+ for information about specific pages and then use the
+ command-line to extract those pages, 1-based indexing is
+ easier. Besides, it's more convenient to subtract 1 in a
+ program written in a real programming language than it is to add 1 in
+ shell code.
-
-
-
-
- There are a few limitations to be aware of with the json structure:
-
-
-
- Strings, names, and indirect object references in the original
- PDF file are all converted to strings in the json
- representation. In the case of a “normal” PDF file,
- you can tell the difference because a name starts with a slash
- (/), and an indirect object reference looks
- like n n R, but if there were to be a string
- that looked like a name or indirect object reference, there
- would be no way to tell this from the json output. Note that
- there are certain cases where you know for sure what something
- is, such as knowing that dictionary keys in objects are always
- names and that certain things in the higher-level computed data
- are known to contain indirect object references.
-
-
-
-
- The json format doesn't support binary data very well. Mostly
- the details are not important, but they are presented here for
- information. When qpdf outputs a string in the json
- representation, it converts the string to UTF-8, assuming usual
- PDF string semantics. Specifically, if the original string is
- UTF-16, it is converted to UTF-8. Otherwise, it is assumed to
- have PDF doc encoding, and is converted to UTF-8 with that
- assumption. This causes strange things to happen to binary
- strings. For example, if you had the binary string
- <038051>, this would be output to the
- json as \u0003•Q because
- 03 is not a printable character and
- 80 is the bullet character in PDF doc
- encoding and is mapped to the Unicode value
- 2022. Since 51 is
- Q, it is output as is. If you wanted to
- convert back from here to a binary string, would have to
- recognize Unicode values whose code points are higher than
- 0xFF and map those back to their
- corresponding PDF doc encoding characters. There is no way to
- tell the difference between a Unicode string that was originally
- encoded as UTF-16 or one that was converted from PDF doc
- encoding. In other words, it's best if you don't try to use the
- json format to extract binary strings from the PDF file, but if
- you really had to, it could be done. Note that qpdf's
- option does not have this
- limitation and will reveal the string as encoded in the original
- file.
-
-
-
-
-
- For specific details on the information provided in the json
- output, please run qpdf --json-help.
-
+
+
+ The image information included in the page
+ section of the json output includes the key
+ “filterable”. Note that the
+ value of this field may depend on the
+ --decode-level that you invoke qpdf with. The
+ json output includes a top-level key
+ “parameters” that indicates the
+ decode level used for computing whether a stream was
+ filterable. For example, jpeg images will be shown as not
+ filterable by default, but they will be shown as filterable if
+ you run qpdf --json --decode-level=all.
+
+
+
+
+ Design and Library Notes
diff --git a/qpdf/qpdf.cc b/qpdf/qpdf.cc
index 45bce84..94609bd 100644
--- a/qpdf/qpdf.cc
+++ b/qpdf/qpdf.cc
@@ -338,6 +338,9 @@ static JSON json_schema(std::set* keys = 0)
outline.addDictionaryMember(
"dest",
JSON::makeString("outline destination dictionary"));
+ page.addDictionaryMember(
+ "pageposfrom1",
+ JSON::makeString("position of page in document numbering from 1"));
}
if (all_keys || keys->count("pagelabels"))
{
@@ -371,6 +374,10 @@ static JSON json_schema(std::set* keys = 0)
outlines.addDictionaryMember(
"open",
JSON::makeString("whether the outline is displayed expanded"));
+ outlines.addDictionaryMember(
+ "destpageposfrom1",
+ JSON::makeString("position of destination page in document"
+ " numbered from 1; null if not known"));
}
return schema;
}
@@ -2813,6 +2820,7 @@ static void do_json_pages(QPDF& pdf, Options& o, JSON& j)
j_outline.addDictionaryMember(
"dest", (*oiter).getDest().getJSON(true));
}
+ j_page.addDictionaryMember("pageposfrom1", JSON::makeInt(1 + pageno));
}
}
@@ -2847,7 +2855,8 @@ static void do_json_page_labels(QPDF& pdf, Options& o, JSON& j)
}
static void add_outlines_to_json(
- std::list outlines, JSON& j)
+ std::list outlines, JSON& j,
+ std::map& page_numbers)
{
for (std::list::iterator iter = outlines.begin();
iter != outlines.end(); ++iter)
@@ -2858,17 +2867,39 @@ static void add_outlines_to_json(
jo.addDictionaryMember("title", JSON::makeString(ol.getTitle()));
jo.addDictionaryMember("dest", ol.getDest().getJSON(true));
jo.addDictionaryMember("open", JSON::makeBool(ol.getCount() >= 0));
+ QPDFObjectHandle page = ol.getDestPage();
+ JSON j_destpage = JSON::makeNull();
+ if (page.isIndirect())
+ {
+ QPDFObjGen og = page.getObjGen();
+ if (page_numbers.count(og))
+ {
+ j_destpage = JSON::makeInt(page_numbers[og]);
+ }
+ }
+ jo.addDictionaryMember("destpageposfrom1", j_destpage);
JSON j_kids = jo.addDictionaryMember("kids", JSON::makeArray());
- add_outlines_to_json(ol.getKids(), j_kids);
+ add_outlines_to_json(ol.getKids(), j_kids, page_numbers);
}
}
static void do_json_outlines(QPDF& pdf, Options& o, JSON& j)
{
+ std::map page_numbers;
+ QPDFPageDocumentHelper dh(pdf);
+ std::vector pages = dh.getAllPages();
+ int n = 0;
+ for (std::vector::iterator iter = pages.begin();
+ iter != pages.end(); ++iter)
+ {
+ QPDFObjectHandle oh = (*iter).getObjectHandle();
+ page_numbers[oh.getObjGen()] = ++n;
+ }
+
JSON j_outlines = j.addDictionaryMember(
"outlines", JSON::makeArray());
QPDFOutlineDocumentHelper odh(pdf);
- add_outlines_to_json(odh.getTopLevelOutlines(), j_outlines);
+ add_outlines_to_json(odh.getTopLevelOutlines(), j_outlines, page_numbers);
}
static void do_json(QPDF& pdf, Options& o)