Commit 03e27709f32ebc83b1c351da5c03ffb2d18f28da

Authored by Jay Berkenbilt
1 parent 7ff234a9

Improve Unicode filename testing

Remove dependency on the behavior of perl for reliable creation of
Unicode file names on Windows.
... ... @@ -170,6 +170,14 @@ I find it useful to make reference to them in this list
170 170  
171 171 * Pl_TIFFPredictor is pretty slow.
172 172  
  173 + * Support for handling file names with Unicode characters in Windows
  174 + is incomplete. qpdf seems to support them okay from a functionality
  175 + standpoint, and the right thing happens if you pass in UTF-8
  176 + encoded filenames to QPDF library routines in Windows (they are
  177 + converted internally to wchar_t*), but file names are encoded in
  178 + UTF-8 on output, which doesn't produce nice error messages or
  179 + output on Windows in some cases.
  180 +
173 181 * If we ever wanted to do anything more with character encoding, see
174 182 ../misc/character-encoding/, which includes machine-readable dump
175 183 of table D.2 in the ISO-32000 PDF spec. This shows the mapping
... ...
manual/qpdf-manual.xml
... ... @@ -2612,6 +2612,31 @@ outfile.pdf</option>
2612 2612 </varlistentry>
2613 2613 </variablelist>
2614 2614 </sect1>
  2615 + <sect1 id="ref.unicode-files">
  2616 + <title>A Note About Unicode File Names</title>
  2617 + <para>
  2618 + When strings are passed to qpdf library routines either as
  2619 + <literal>char*</literal> or as <literal>std::string</literal>,
  2620 + they are treated as byte arrays except where otherwise noted. When
  2621 + Unicode is desired, qpdf wants UTF-8 unless otherwise noted in
  2622 + comments in header files. In modern UNIX/Linux environments, this
  2623 + generally does the right thing. In Windows, it's a bit more
  2624 + complicated. Starting in qpdf 8.4.0, passwords that contain
  2625 + Unicode characters are handled much better, and starting in qpdf
  2626 + 8.4.1, the library attempts to properly handle Unicode characters
  2627 + in filenames. In particular, in Windows, if a UTF-8 encoded string
  2628 + is used as a filename in either <classname>QPDF</classname> or
  2629 + <classname>QPDFWriter</classname>, it is internally converted to
  2630 + <literal>wchar_t*</literal>, and Unicode-aware Windows APIs are
  2631 + used. As such, qpdf will generally operate properly on files with
  2632 + non-ASCII characters in their names as long as the filenames are
  2633 + UTF-8 encoded for passing into the qpdf library API, but there are
  2634 + still some rough edges, such as the encoding of the filenames in
  2635 + error messages or CLI output messages. Patches or bug reports are
  2636 + welcome for any continuing issues with Unicode file names in
  2637 + Windows.
  2638 + </para>
  2639 + </sect1>
2615 2640 </chapter>
2616 2641 <chapter id="ref.json">
2617 2642 <title>QPDF JSON</title>
... ...
qpdf/build.mk
... ... @@ -5,7 +5,8 @@ BINS_qpdf = \
5 5 test_large_file \
6 6 test_pdf_doc_encoding \
7 7 test_pdf_unicode \
8   - test_tokenizer
  8 + test_tokenizer \
  9 + test_unicode_filenames
9 10 CBINS_qpdf = qpdf-ctest
10 11  
11 12 TARGETS_qpdf = $(foreach B,$(BINS_qpdf) $(CBINS_qpdf),qpdf/$(OUTPUT_DIR)/$(call binname,$(B)))
... ... @@ -20,6 +21,8 @@ TC_SRCS_qpdf = $(wildcard libqpdf/*.cc) $(wildcard qpdf/*.cc)
20 21  
21 22 XCXXFLAGS_qpdf_qpdf := $(WINDOWS_WMAIN_COMPILE)
22 23 XLDFLAGS_qpdf_qpdf := $(WINDOWS_WMAIN_LINK)
  24 +XCXXFLAGS_qpdf_test_unicode_filenames := $(WINDOWS_WMAIN_COMPILE)
  25 +XLDFLAGS_qpdf_test_unicode_filenames := $(WINDOWS_WMAIN_LINK)
23 26  
24 27 $(foreach B,$(BINS_qpdf),$(eval \
25 28 OBJS_$(B) = $(call src_to_obj,qpdf/$(B).cc)))
... ...
qpdf/qtest/qpdf.test
... ... @@ -135,7 +135,7 @@ foreach my $c (@completion_tests)
135 135 show_ntests();
136 136 # ----------
137 137 $td->notify("--- Argument Parsing ---");
138   -$n_tests += 8;
  138 +$n_tests += 6;
139 139  
140 140 $td->runtest("required argument",
141 141 {$td->COMMAND => "qpdf --password minimal.pdf"},
... ... @@ -167,10 +167,21 @@ $td->runtest("extra overlay filename",
167 167 {$td->REGEXP => ".*overlay file already specified.*",
168 168 $td->EXIT_STATUS => 2},
169 169 $td->NORMALIZE_NEWLINES);
  170 +
  171 +show_ntests();
  172 +# ----------
  173 +$td->notify("--- Unicode Filenames ---");
  174 +$n_tests += 3;
  175 +
  176 +$td->runtest("create unicode filenames",
  177 + {$td->COMMAND => "test_unicode_filenames"},
  178 + {$td->STRING => "created Unicode filenames\n",
  179 + $td->EXIT_STATUS => 0},
  180 + $td->NORMALIZE_NEWLINES);
  181 +
170 182 foreach my $d (['auto-ü', 1], ['auto-öπ', 2])
171 183 {
172 184 my ($u, $n) = @$d;
173   - copy('minimal.pdf', "$u.pdf");
174 185 $td->runtest("unicode filename $u",
175 186 {$td->COMMAND => "qpdf --check $u.pdf"},
176 187 {$td->FILE => "check-unicode-filename-$n.out",
... ...
qpdf/test_unicode_filenames.cc 0 → 100644
  1 +#ifdef _WIN32
  2 +#include <windows.h>
  3 +#include <direct.h>
  4 +#include <io.h>
  5 +#endif
  6 +
  7 +#include <iostream>
  8 +#include <stdlib.h>
  9 +#include <stdio.h>
  10 +
// Copy the remaining contents of `in` to `out`, then close both streams.
//
// in:  stream opened for reading, or 0 if the caller's open failed.
// out: stream opened for writing, or 0 if the caller's open failed.
//
// If either stream is 0, prints "errors opening files" to stderr and
// exits with status 2. If reading, writing, or closing the output
// fails, prints "errors reading or writing" to stderr and exits with
// status 2. On success both streams have been closed on return.
static void do_copy(FILE* in, FILE* out)
{
    if ((in == 0) || (out == 0))
    {
        std::cerr << "errors opening files" << std::endl;
        exit(2);
    }
    char buf[10240];
    size_t len = 0;
    bool failed = false;
    while ((len = fread(buf, 1, sizeof(buf), in)) > 0)
    {
        // A short write means the data did not all reach the stream.
        if (fwrite(buf, 1, len, out) != len)
        {
            failed = true;
            break;
        }
    }
    // fread returns 0 for both end-of-file and error; only the stream's
    // error indicator tells the two apart.
    if (ferror(in))
    {
        failed = true;
    }
    fclose(in);
    // fclose flushes buffered output, so a write failure may only
    // surface here.
    if (fclose(out) != 0)
    {
        failed = true;
    }
    if (failed)
    {
        std::cerr << "errors reading or writing" << std::endl;
        exit(2);
    }
}
  32 +
  33 +#ifdef WINDOWS_WMAIN
  34 +
  35 +void copy(wchar_t const* outname)
  36 +{
  37 +#ifdef _MSC_VER
  38 + FILE* in = 0;
  39 + _wfopen_s(&in, L"minimal.pdf", L"rb");
  40 + FILE* out = 0;
  41 + _wfopen_s(&out, outname, L"wb");
  42 +#else
  43 + FILE* in = _wfopen(L"minimal.pdf", L"rb");
  44 + FILE* out = _wfopen(outname, L"wb");
  45 +#endif
  46 + do_copy(in, out);
  47 +}
  48 +
  49 +extern "C"
  50 +int wmain(int argc, wchar_t* argv[])
  51 +{
  52 + // Unicode
  53 + wchar_t const* f1 = L"auto-\xfc.pdf";
  54 + wchar_t const* f2 = L"auto-\xf6\x03c0.pdf";
  55 + copy(f1);
  56 + copy(f2);
  57 + std::cout << "created Unicode filenames" << std::endl;
  58 + return 0;
  59 +}
  60 +
  61 +#else
  62 +
  63 +void copy(char const* outname)
  64 +{
  65 + FILE* in = fopen("minimal.pdf", "rb");
  66 + FILE* out = fopen(outname, "wb");
  67 + do_copy(in, out);
  68 +}
  69 +
  70 +int main(int argc, char* argv[])
  71 +{
  72 + // Explicit UTF-8 encoding
  73 + char const* f1 = "auto-\xc3\xbc.pdf";
  74 + char const* f2 = "auto-\xc3\xb6\xcf\x80.pdf";
  75 + copy(f1);
  76 + copy(f2);
  77 + std::cout << "created Unicode filenames" << std::endl;
  78 + return 0;
  79 +}
  80 +
  81 +#endif
... ...