Commit 03e27709f32ebc83b1c351da5c03ffb2d18f28da
1 parent
7ff234a9
Improve Unicode filename testing
Remove dependency on the behavior of perl for reliable creation of Unicode file names on Windows.
Showing
5 changed files
with
131 additions
and
3 deletions
TODO
| @@ -170,6 +170,14 @@ I find it useful to make reference to them in this list | @@ -170,6 +170,14 @@ I find it useful to make reference to them in this list | ||
| 170 | 170 | ||
| 171 | * Pl_TIFFPredictor is pretty slow. | 171 | * Pl_TIFFPredictor is pretty slow. |
| 172 | 172 | ||
| 173 | + * Support for handling file names with Unicode characters in Windows | ||
| 174 | + is incomplete. qpdf seems to support them okay from a functionality | ||
| 175 | + standpoint, and the right thing happens if you pass in UTF-8 | ||
| 176 | + encoded filenames to QPDF library routines in Windows (they are | ||
| 177 | + converted internally to wchar_t*), but file names are encoded in | ||
| 178 | + UTF-8 on output, which doesn't produce nice error messages or | ||
| 179 | + output on Windows in some cases. | ||
| 180 | + | ||
| 173 | * If we ever wanted to do anything more with character encoding, see | 181 | * If we ever wanted to do anything more with character encoding, see |
| 174 | ../misc/character-encoding/, which includes machine-readable dump | 182 | ../misc/character-encoding/, which includes machine-readable dump |
| 175 | of table D.2 in the ISO-32000 PDF spec. This shows the mapping | 183 | of table D.2 in the ISO-32000 PDF spec. This shows the mapping |
manual/qpdf-manual.xml
| @@ -2612,6 +2612,31 @@ outfile.pdf</option> | @@ -2612,6 +2612,31 @@ outfile.pdf</option> | ||
| 2612 | </varlistentry> | 2612 | </varlistentry> |
| 2613 | </variablelist> | 2613 | </variablelist> |
| 2614 | </sect1> | 2614 | </sect1> |
| 2615 | + <sect1 id="ref.unicode-files"> | ||
| 2616 | + <title>A Note About Unicode File Names</title> | ||
| 2617 | + <para> | ||
| 2618 | + When strings are passed to qpdf library routines either as | ||
| 2619 | + <literal>char*</literal> or as <literal>std::string</literal>, | ||
| 2620 | + they are treated as byte arrays except where otherwise noted. When | ||
| 2621 | + Unicode is desired, qpdf wants UTF-8 unless otherwise noted in | ||
| 2622 | + comments in header files. In modern UNIX/Linux environments, this | ||
| 2623 | + generally does the right thing. In Windows, it's a bit more | ||
| 2624 | + complicated. Starting in qpdf 8.4.0, passwords that contain | ||
| 2625 | + Unicode characters are handled much better, and starting in qpdf | ||
| 2626 | + 8.4.1, the library attempts to properly handle Unicode characters | ||
| 2627 | + in filenames. In particular, in Windows, if a UTF-8 encoded string | ||
| 2628 | + is used as a filename in either <classname>QPDF</classname> or | ||
| 2629 | + <classname>QPDFWriter</classname>, it is internally converted to | ||
| 2630 | + <literal>wchar_t*</literal>, and Unicode-aware Windows APIs are | ||
| 2631 | + used. As such, qpdf will generally operate properly on files with | ||
| 2632 | + non-ASCII characters in their names as long as the filenames are | ||
| 2633 | + UTF-8 encoded for passing into the qpdf library API, but there are | ||
| 2634 | + still some rough edges, such as the encoding of the filenames in | ||
| 2635 | + error messages or CLI output messages. Patches or bug reports are | ||
| 2636 | + welcome for any continuing issues with Unicode file names in | ||
| 2637 | + Windows. | ||
| 2638 | + </para> | ||
| 2639 | + </sect1> | ||
| 2615 | </chapter> | 2640 | </chapter> |
| 2616 | <chapter id="ref.json"> | 2641 | <chapter id="ref.json"> |
| 2617 | <title>QPDF JSON</title> | 2642 | <title>QPDF JSON</title> |
qpdf/build.mk
| @@ -5,7 +5,8 @@ BINS_qpdf = \ | @@ -5,7 +5,8 @@ BINS_qpdf = \ | ||
| 5 | test_large_file \ | 5 | test_large_file \ |
| 6 | test_pdf_doc_encoding \ | 6 | test_pdf_doc_encoding \ |
| 7 | test_pdf_unicode \ | 7 | test_pdf_unicode \ |
| 8 | - test_tokenizer | 8 | + test_tokenizer \ |
| 9 | + test_unicode_filenames | ||
| 9 | CBINS_qpdf = qpdf-ctest | 10 | CBINS_qpdf = qpdf-ctest |
| 10 | 11 | ||
| 11 | TARGETS_qpdf = $(foreach B,$(BINS_qpdf) $(CBINS_qpdf),qpdf/$(OUTPUT_DIR)/$(call binname,$(B))) | 12 | TARGETS_qpdf = $(foreach B,$(BINS_qpdf) $(CBINS_qpdf),qpdf/$(OUTPUT_DIR)/$(call binname,$(B))) |
| @@ -20,6 +21,8 @@ TC_SRCS_qpdf = $(wildcard libqpdf/*.cc) $(wildcard qpdf/*.cc) | @@ -20,6 +21,8 @@ TC_SRCS_qpdf = $(wildcard libqpdf/*.cc) $(wildcard qpdf/*.cc) | ||
| 20 | 21 | ||
| 21 | XCXXFLAGS_qpdf_qpdf := $(WINDOWS_WMAIN_COMPILE) | 22 | XCXXFLAGS_qpdf_qpdf := $(WINDOWS_WMAIN_COMPILE) |
| 22 | XLDFLAGS_qpdf_qpdf := $(WINDOWS_WMAIN_LINK) | 23 | XLDFLAGS_qpdf_qpdf := $(WINDOWS_WMAIN_LINK) |
| 24 | +XCXXFLAGS_qpdf_test_unicode_filenames := $(WINDOWS_WMAIN_COMPILE) | ||
| 25 | +XLDFLAGS_qpdf_test_unicode_filenames := $(WINDOWS_WMAIN_LINK) | ||
| 23 | 26 | ||
| 24 | $(foreach B,$(BINS_qpdf),$(eval \ | 27 | $(foreach B,$(BINS_qpdf),$(eval \ |
| 25 | OBJS_$(B) = $(call src_to_obj,qpdf/$(B).cc))) | 28 | OBJS_$(B) = $(call src_to_obj,qpdf/$(B).cc))) |
qpdf/qtest/qpdf.test
| @@ -135,7 +135,7 @@ foreach my $c (@completion_tests) | @@ -135,7 +135,7 @@ foreach my $c (@completion_tests) | ||
| 135 | show_ntests(); | 135 | show_ntests(); |
| 136 | # ---------- | 136 | # ---------- |
| 137 | $td->notify("--- Argument Parsing ---"); | 137 | $td->notify("--- Argument Parsing ---"); |
| 138 | -$n_tests += 8; | 138 | +$n_tests += 6; |
| 139 | 139 | ||
| 140 | $td->runtest("required argument", | 140 | $td->runtest("required argument", |
| 141 | {$td->COMMAND => "qpdf --password minimal.pdf"}, | 141 | {$td->COMMAND => "qpdf --password minimal.pdf"}, |
| @@ -167,10 +167,21 @@ $td->runtest("extra overlay filename", | @@ -167,10 +167,21 @@ $td->runtest("extra overlay filename", | ||
| 167 | {$td->REGEXP => ".*overlay file already specified.*", | 167 | {$td->REGEXP => ".*overlay file already specified.*", |
| 168 | $td->EXIT_STATUS => 2}, | 168 | $td->EXIT_STATUS => 2}, |
| 169 | $td->NORMALIZE_NEWLINES); | 169 | $td->NORMALIZE_NEWLINES); |
| 170 | + | ||
| 171 | +show_ntests(); | ||
| 172 | +# ---------- | ||
| 173 | +$td->notify("--- Unicode Filenames ---"); | ||
| 174 | +$n_tests += 3; | ||
| 175 | + | ||
| 176 | +$td->runtest("create unicode filenames", | ||
| 177 | + {$td->COMMAND => "test_unicode_filenames"}, | ||
| 178 | + {$td->STRING => "created Unicode filenames\n", | ||
| 179 | + $td->EXIT_STATUS => 0}, | ||
| 180 | + $td->NORMALIZE_NEWLINES); | ||
| 181 | + | ||
| 170 | foreach my $d (['auto-ü', 1], ['auto-öπ', 2]) | 182 | foreach my $d (['auto-ü', 1], ['auto-öπ', 2]) |
| 171 | { | 183 | { |
| 172 | my ($u, $n) = @$d; | 184 | my ($u, $n) = @$d; |
| 173 | - copy('minimal.pdf', "$u.pdf"); | ||
| 174 | $td->runtest("unicode filename $u", | 185 | $td->runtest("unicode filename $u", |
| 175 | {$td->COMMAND => "qpdf --check $u.pdf"}, | 186 | {$td->COMMAND => "qpdf --check $u.pdf"}, |
| 176 | {$td->FILE => "check-unicode-filename-$n.out", | 187 | {$td->FILE => "check-unicode-filename-$n.out", |
qpdf/test_unicode_filenames.cc
0 → 100644
| 1 | +#ifdef _WIN32 | ||
| 2 | +#include <windows.h> | ||
| 3 | +#include <direct.h> | ||
| 4 | +#include <io.h> | ||
| 5 | +#endif | ||
| 6 | + | ||
| 7 | +#include <iostream> | ||
| 8 | +#include <stdlib.h> | ||
| 9 | +#include <stdio.h> | ||
| 10 | + | ||
// Copy everything readable from `in` to `out`, then close both streams.
//
// Callers pass the results of fopen/_wfopen unchecked, so either
// pointer may be null. On a null stream, or on any read, write, or
// close failure, a message is printed to stderr and the process exits
// with status 2. On success the function returns normally.
static void do_copy(FILE* in, FILE* out)
{
    if ((in == 0) || (out == 0))
    {
        std::cerr << "errors opening files" << std::endl;
        exit(2);
    }
    char buf[10240];
    size_t len = 0;
    bool failed = false;
    while ((len = fread(buf, 1, sizeof(buf), in)) > 0)
    {
        // A short write means the output is incomplete; stop copying.
        if (fwrite(buf, 1, len, out) != len)
        {
            failed = true;
            break;
        }
    }
    // fread returns 0 both at end-of-file and on error, so the
    // stream's error indicator is the only way to tell them apart.
    if (ferror(in))
    {
        failed = true;
    }
    fclose(in);
    // Closing flushes buffered output, so a write error can first
    // show up here.
    if (fclose(out) != 0)
    {
        failed = true;
    }
    if (failed)
    {
        std::cerr << "errors reading or writing" << std::endl;
        exit(2);
    }
}
| 32 | + | ||
| 33 | +#ifdef WINDOWS_WMAIN | ||
| 34 | + | ||
| 35 | +void copy(wchar_t const* outname) | ||
| 36 | +{ | ||
| 37 | +#ifdef _MSC_VER | ||
| 38 | + FILE* in = 0; | ||
| 39 | + _wfopen_s(&in, L"minimal.pdf", L"rb"); | ||
| 40 | + FILE* out = 0; | ||
| 41 | + _wfopen_s(&out, outname, L"wb"); | ||
| 42 | +#else | ||
| 43 | + FILE* in = _wfopen(L"minimal.pdf", L"rb"); | ||
| 44 | + FILE* out = _wfopen(outname, L"wb"); | ||
| 45 | +#endif | ||
| 46 | + do_copy(in, out); | ||
| 47 | +} | ||
| 48 | + | ||
| 49 | +extern "C" | ||
| 50 | +int wmain(int argc, wchar_t* argv[]) | ||
| 51 | +{ | ||
| 52 | + // Unicode | ||
| 53 | + wchar_t const* f1 = L"auto-\xfc.pdf"; | ||
| 54 | + wchar_t const* f2 = L"auto-\xf6\x03c0.pdf"; | ||
| 55 | + copy(f1); | ||
| 56 | + copy(f2); | ||
| 57 | + std::cout << "created Unicode filenames" << std::endl; | ||
| 58 | + return 0; | ||
| 59 | +} | ||
| 60 | + | ||
| 61 | +#else | ||
| 62 | + | ||
| 63 | +void copy(char const* outname) | ||
| 64 | +{ | ||
| 65 | + FILE* in = fopen("minimal.pdf", "rb"); | ||
| 66 | + FILE* out = fopen(outname, "wb"); | ||
| 67 | + do_copy(in, out); | ||
| 68 | +} | ||
| 69 | + | ||
| 70 | +int main(int argc, char* argv[]) | ||
| 71 | +{ | ||
| 72 | + // Explicit UTF-8 encoding | ||
| 73 | + char const* f1 = "auto-\xc3\xbc.pdf"; | ||
| 74 | + char const* f2 = "auto-\xc3\xb6\xcf\x80.pdf"; | ||
| 75 | + copy(f1); | ||
| 76 | + copy(f2); | ||
| 77 | + std::cout << "created Unicode filenames" << std::endl; | ||
| 78 | + return 0; | ||
| 79 | +} | ||
| 80 | + | ||
| 81 | +#endif |