Commit 5bbb0d4c307bff58e9928a1c757438d033687ce3

Authored by Jay Berkenbilt
1 parent 37f05e67

Replace switch statements with static map initializers

Character transcoding from Unicode to single-byte encodings used
hard-coded switch statements because the code predated our adoption of
C++11. Now that we have thread-safe static initialization and map
initializer lists, use static map initializers instead.
Showing 2 changed files with 72 additions and 574 deletions
... ... @@ -11,9 +11,6 @@ In order:
11 11 Other (do in any order):
12 12  
13 13 Misc
14   -* Get rid of "ugly switch statements" in QUtil.cc -- replace with
15   - static map initializers. (Search for "ugly switch statements" below
16   - as well.)
17 14 * Consider exposing get_next_utf8_codepoint in QUtil
18 15 * Add QUtil::is_explicit_utf8 that does what QPDF_String::getUTF8Val
19 16 does to detect UTF-8 encoded strings per PDF 2.0 spec.
... ... @@ -396,10 +393,9 @@ we might do about it.
396 393 * When mapping characters to widths, we will need to care about
397 394 character encoding. For built-in fonts, we can create a map from
398 395 Unicode code point to width and then go from the font's encoding to
399   - unicode to the width. Get rid of "ugly switch statements" in
400   - QUtil.cc and replace with static map initializers. See
401   - misc/character-encoding/ (not on github) and font metric information
402   - for the 14 standard fonts in my local pdf-spec directory.
  396 + unicode to the width. See misc/character-encoding/ (not on github)
  397 + and font metric information for the 14 standard fonts in my local
  398 + pdf-spec directory.
403 399  
404 400 * Once we know about character widths, we can correctly support
405 401 auto-sized variable text fields (0 Tf). If this is fixed, search for
... ...
libqpdf/QUtil.cc
... ... @@ -16,6 +16,7 @@
16 16 #include <fstream>
17 17 #include <iomanip>
18 18 #include <locale>
  19 +#include <map>
19 20 #include <memory>
20 21 #include <regex>
21 22 #include <set>
... ... @@ -251,6 +252,59 @@ static unsigned short mac_roman_to_unicode[] = {
251 252 0x02c7, // 0xff
252 253 };
253 254  
  255 +static std::map<unsigned long, unsigned char> unicode_to_win_ansi = {
  256 + {0x20ac, 0x80}, {0x201a, 0x82}, {0x192, 0x83}, {0x201e, 0x84},
  257 + {0x2026, 0x85}, {0x2020, 0x86}, {0x2021, 0x87}, {0x2c6, 0x88},
  258 + {0x2030, 0x89}, {0x160, 0x8a}, {0x2039, 0x8b}, {0x152, 0x8c},
  259 + {0x17d, 0x8e}, {0x2018, 0x91}, {0x2019, 0x92}, {0x201c, 0x93},
  260 + {0x201d, 0x94}, {0x2022, 0x95}, {0x2013, 0x96}, {0x2014, 0x97},
  261 + {0x303, 0x98}, {0x2122, 0x99}, {0x161, 0x9a}, {0x203a, 0x9b},
  262 + {0x153, 0x9c}, {0x17e, 0x9e}, {0x178, 0x9f}, {0xa0, 0xa0},
  263 +};
  264 +static std::map<unsigned long, unsigned char> unicode_to_mac_roman = {
  265 + {0xc4, 0x80}, {0xc5, 0x81}, {0xc7, 0x82}, {0xc9, 0x83},
  266 + {0xd1, 0x84}, {0xd6, 0x85}, {0xdc, 0x86}, {0xe1, 0x87},
  267 + {0xe0, 0x88}, {0xe2, 0x89}, {0xe4, 0x8a}, {0xe3, 0x8b},
  268 + {0xe5, 0x8c}, {0xe7, 0x8d}, {0xe9, 0x8e}, {0xe8, 0x8f},
  269 + {0xea, 0x90}, {0xeb, 0x91}, {0xed, 0x92}, {0xec, 0x93},
  270 + {0xee, 0x94}, {0xef, 0x95}, {0xf1, 0x96}, {0xf3, 0x97},
  271 + {0xf2, 0x98}, {0xf4, 0x99}, {0xf6, 0x9a}, {0xf5, 0x9b},
  272 + {0xfa, 0x9c}, {0xf9, 0x9d}, {0xfb, 0x9e}, {0xfc, 0x9f},
  273 + {0x2020, 0xa0}, {0xb0, 0xa1}, {0xa2, 0xa2}, {0xa3, 0xa3},
  274 + {0xa7, 0xa4}, {0x2022, 0xa5}, {0xb6, 0xa6}, {0xdf, 0xa7},
  275 + {0xae, 0xa8}, {0xa9, 0xa9}, {0x2122, 0xaa}, {0x301, 0xab},
  276 + {0x308, 0xac}, {0xc6, 0xae}, {0xd8, 0xaf}, {0xb1, 0xb1},
  277 + {0xa5, 0xb4}, {0x3bc, 0xb5}, {0x1d43, 0xbb}, {0x1d52, 0xbc},
  278 + {0xe6, 0xbe}, {0xf8, 0xbf}, {0xbf, 0xc0}, {0xa1, 0xc1},
  279 + {0xac, 0xc2}, {0x192, 0xc4}, {0xab, 0xc7}, {0xbb, 0xc8},
  280 + {0x2026, 0xc9}, {0xc0, 0xcb}, {0xc3, 0xcc}, {0xd5, 0xcd},
  281 + {0x152, 0xce}, {0x153, 0xcf}, {0x2013, 0xd0}, {0x2014, 0xd1},
  282 + {0x201c, 0xd2}, {0x201d, 0xd3}, {0x2018, 0xd4}, {0x2019, 0xd5},
  283 + {0xf7, 0xd6}, {0xff, 0xd8}, {0x178, 0xd9}, {0x2044, 0xda},
  284 + {0xa4, 0xdb}, {0x2039, 0xdc}, {0x203a, 0xdd}, {0xfb01, 0xde},
  285 + {0xfb02, 0xdf}, {0x2021, 0xe0}, {0xb7, 0xe1}, {0x201a, 0xe2},
  286 + {0x201e, 0xe3}, {0x2030, 0xe4}, {0xc2, 0xe5}, {0xca, 0xe6},
  287 + {0xc1, 0xe7}, {0xcb, 0xe8}, {0xc8, 0xe9}, {0xcd, 0xea},
  288 + {0xce, 0xeb}, {0xcf, 0xec}, {0xcc, 0xed}, {0xd3, 0xee},
  289 + {0xd4, 0xef}, {0xd2, 0xf1}, {0xda, 0xf2}, {0xdb, 0xf3},
  290 + {0xd9, 0xf4}, {0x131, 0xf5}, {0x2c6, 0xf6}, {0x303, 0xf7},
  291 + {0x304, 0xf8}, {0x306, 0xf9}, {0x307, 0xfa}, {0x30a, 0xfb},
  292 + {0x327, 0xfc}, {0x30b, 0xfd}, {0x328, 0xfe}, {0x2c7, 0xff},
  293 +};
  294 +static std::map<unsigned long, unsigned char> unicode_to_pdf_doc = {
  295 + {0x02d8, 0x18}, {0x02c7, 0x19}, {0x02c6, 0x1a}, {0x02d9, 0x1b},
  296 + {0x02dd, 0x1c}, {0x02db, 0x1d}, {0x02da, 0x1e}, {0x02dc, 0x1f},
  297 + {0x2022, 0x80}, {0x2020, 0x81}, {0x2021, 0x82}, {0x2026, 0x83},
  298 + {0x2014, 0x84}, {0x2013, 0x85}, {0x0192, 0x86}, {0x2044, 0x87},
  299 + {0x2039, 0x88}, {0x203a, 0x89}, {0x2212, 0x8a}, {0x2030, 0x8b},
  300 + {0x201e, 0x8c}, {0x201c, 0x8d}, {0x201d, 0x8e}, {0x2018, 0x8f},
  301 + {0x2019, 0x90}, {0x201a, 0x91}, {0x2122, 0x92}, {0xfb01, 0x93},
  302 + {0xfb02, 0x94}, {0x0141, 0x95}, {0x0152, 0x96}, {0x0160, 0x97},
  303 + {0x0178, 0x98}, {0x017d, 0x99}, {0x0131, 0x9a}, {0x0142, 0x9b},
  304 + {0x0153, 0x9c}, {0x0161, 0x9d}, {0x017e, 0x9e}, {0xfffd, 0x9f},
  305 + {0x20ac, 0xa0},
  306 +};
  307 +
254 308 namespace
255 309 {
256 310 class FileCloser
... ... @@ -1447,583 +1501,31 @@ enum encoding_e { e_utf16, e_ascii, e_winansi, e_macroman, e_pdfdoc };
1447 1501 static unsigned char
1448 1502 encode_winansi(unsigned long codepoint)
1449 1503 {
1450   - // Use this ugly switch statement to avoid a static, which is not
1451   - // thread-safe.
1452   - unsigned char ch = '\0';
1453   - switch (codepoint) {
1454   - case 0x20ac:
1455   - ch = 0x80;
1456   - break;
1457   - case 0x201a:
1458   - ch = 0x82;
1459   - break;
1460   - case 0x192:
1461   - ch = 0x83;
1462   - break;
1463   - case 0x201e:
1464   - ch = 0x84;
1465   - break;
1466   - case 0x2026:
1467   - ch = 0x85;
1468   - break;
1469   - case 0x2020:
1470   - ch = 0x86;
1471   - break;
1472   - case 0x2021:
1473   - ch = 0x87;
1474   - break;
1475   - case 0x2c6:
1476   - ch = 0x88;
1477   - break;
1478   - case 0x2030:
1479   - ch = 0x89;
1480   - break;
1481   - case 0x160:
1482   - ch = 0x8a;
1483   - break;
1484   - case 0x2039:
1485   - ch = 0x8b;
1486   - break;
1487   - case 0x152:
1488   - ch = 0x8c;
1489   - break;
1490   - case 0x17d:
1491   - ch = 0x8e;
1492   - break;
1493   - case 0x2018:
1494   - ch = 0x91;
1495   - break;
1496   - case 0x2019:
1497   - ch = 0x92;
1498   - break;
1499   - case 0x201c:
1500   - ch = 0x93;
1501   - break;
1502   - case 0x201d:
1503   - ch = 0x94;
1504   - break;
1505   - case 0x2022:
1506   - ch = 0x95;
1507   - break;
1508   - case 0x2013:
1509   - ch = 0x96;
1510   - break;
1511   - case 0x2014:
1512   - ch = 0x97;
1513   - break;
1514   - case 0x303:
1515   - ch = 0x98;
1516   - break;
1517   - case 0x2122:
1518   - ch = 0x99;
1519   - break;
1520   - case 0x161:
1521   - ch = 0x9a;
1522   - break;
1523   - case 0x203a:
1524   - ch = 0x9b;
1525   - break;
1526   - case 0x153:
1527   - ch = 0x9c;
1528   - break;
1529   - case 0x17e:
1530   - ch = 0x9e;
1531   - break;
1532   - case 0x178:
1533   - ch = 0x9f;
1534   - break;
1535   - case 0xa0:
1536   - ch = 0xa0;
1537   - break;
1538   - default:
1539   - break;
1540   - }
1541   - return ch;
  1504 + auto i = unicode_to_win_ansi.find(codepoint);
  1505 + if (i != unicode_to_win_ansi.end()) {
  1506 + return i->second;
  1507 + }
  1508 + return '\0';
1542 1509 }
1543 1510  
1544 1511 static unsigned char
1545 1512 encode_macroman(unsigned long codepoint)
1546 1513 {
1547   - // Use this ugly switch statement to avoid a static, which is not
1548   - // thread-safe.
1549   - unsigned char ch = '\0';
1550   - switch (codepoint) {
1551   - case 0xc4:
1552   - ch = 0x80;
1553   - break;
1554   - case 0xc5:
1555   - ch = 0x81;
1556   - break;
1557   - case 0xc7:
1558   - ch = 0x82;
1559   - break;
1560   - case 0xc9:
1561   - ch = 0x83;
1562   - break;
1563   - case 0xd1:
1564   - ch = 0x84;
1565   - break;
1566   - case 0xd6:
1567   - ch = 0x85;
1568   - break;
1569   - case 0xdc:
1570   - ch = 0x86;
1571   - break;
1572   - case 0xe1:
1573   - ch = 0x87;
1574   - break;
1575   - case 0xe0:
1576   - ch = 0x88;
1577   - break;
1578   - case 0xe2:
1579   - ch = 0x89;
1580   - break;
1581   - case 0xe4:
1582   - ch = 0x8a;
1583   - break;
1584   - case 0xe3:
1585   - ch = 0x8b;
1586   - break;
1587   - case 0xe5:
1588   - ch = 0x8c;
1589   - break;
1590   - case 0xe7:
1591   - ch = 0x8d;
1592   - break;
1593   - case 0xe9:
1594   - ch = 0x8e;
1595   - break;
1596   - case 0xe8:
1597   - ch = 0x8f;
1598   - break;
1599   - case 0xea:
1600   - ch = 0x90;
1601   - break;
1602   - case 0xeb:
1603   - ch = 0x91;
1604   - break;
1605   - case 0xed:
1606   - ch = 0x92;
1607   - break;
1608   - case 0xec:
1609   - ch = 0x93;
1610   - break;
1611   - case 0xee:
1612   - ch = 0x94;
1613   - break;
1614   - case 0xef:
1615   - ch = 0x95;
1616   - break;
1617   - case 0xf1:
1618   - ch = 0x96;
1619   - break;
1620   - case 0xf3:
1621   - ch = 0x97;
1622   - break;
1623   - case 0xf2:
1624   - ch = 0x98;
1625   - break;
1626   - case 0xf4:
1627   - ch = 0x99;
1628   - break;
1629   - case 0xf6:
1630   - ch = 0x9a;
1631   - break;
1632   - case 0xf5:
1633   - ch = 0x9b;
1634   - break;
1635   - case 0xfa:
1636   - ch = 0x9c;
1637   - break;
1638   - case 0xf9:
1639   - ch = 0x9d;
1640   - break;
1641   - case 0xfb:
1642   - ch = 0x9e;
1643   - break;
1644   - case 0xfc:
1645   - ch = 0x9f;
1646   - break;
1647   - case 0x2020:
1648   - ch = 0xa0;
1649   - break;
1650   - case 0xb0:
1651   - ch = 0xa1;
1652   - break;
1653   - case 0xa2:
1654   - ch = 0xa2;
1655   - break;
1656   - case 0xa3:
1657   - ch = 0xa3;
1658   - break;
1659   - case 0xa7:
1660   - ch = 0xa4;
1661   - break;
1662   - case 0x2022:
1663   - ch = 0xa5;
1664   - break;
1665   - case 0xb6:
1666   - ch = 0xa6;
1667   - break;
1668   - case 0xdf:
1669   - ch = 0xa7;
1670   - break;
1671   - case 0xae:
1672   - ch = 0xa8;
1673   - break;
1674   - case 0xa9:
1675   - ch = 0xa9;
1676   - break;
1677   - case 0x2122:
1678   - ch = 0xaa;
1679   - break;
1680   - case 0x301:
1681   - ch = 0xab;
1682   - break;
1683   - case 0x308:
1684   - ch = 0xac;
1685   - break;
1686   - case 0xc6:
1687   - ch = 0xae;
1688   - break;
1689   - case 0xd8:
1690   - ch = 0xaf;
1691   - break;
1692   - case 0xb1:
1693   - ch = 0xb1;
1694   - break;
1695   - case 0xa5:
1696   - ch = 0xb4;
1697   - break;
1698   - case 0x3bc:
1699   - ch = 0xb5;
1700   - break;
1701   - case 0x1d43:
1702   - ch = 0xbb;
1703   - break;
1704   - case 0x1d52:
1705   - ch = 0xbc;
1706   - break;
1707   - case 0xe6:
1708   - ch = 0xbe;
1709   - break;
1710   - case 0xf8:
1711   - ch = 0xbf;
1712   - break;
1713   - case 0xbf:
1714   - ch = 0xc0;
1715   - break;
1716   - case 0xa1:
1717   - ch = 0xc1;
1718   - break;
1719   - case 0xac:
1720   - ch = 0xc2;
1721   - break;
1722   - case 0x192:
1723   - ch = 0xc4;
1724   - break;
1725   - case 0xab:
1726   - ch = 0xc7;
1727   - break;
1728   - case 0xbb:
1729   - ch = 0xc8;
1730   - break;
1731   - case 0x2026:
1732   - ch = 0xc9;
1733   - break;
1734   - case 0xc0:
1735   - ch = 0xcb;
1736   - break;
1737   - case 0xc3:
1738   - ch = 0xcc;
1739   - break;
1740   - case 0xd5:
1741   - ch = 0xcd;
1742   - break;
1743   - case 0x152:
1744   - ch = 0xce;
1745   - break;
1746   - case 0x153:
1747   - ch = 0xcf;
1748   - break;
1749   - case 0x2013:
1750   - ch = 0xd0;
1751   - break;
1752   - case 0x2014:
1753   - ch = 0xd1;
1754   - break;
1755   - case 0x201c:
1756   - ch = 0xd2;
1757   - break;
1758   - case 0x201d:
1759   - ch = 0xd3;
1760   - break;
1761   - case 0x2018:
1762   - ch = 0xd4;
1763   - break;
1764   - case 0x2019:
1765   - ch = 0xd5;
1766   - break;
1767   - case 0xf7:
1768   - ch = 0xd6;
1769   - break;
1770   - case 0xff:
1771   - ch = 0xd8;
1772   - break;
1773   - case 0x178:
1774   - ch = 0xd9;
1775   - break;
1776   - case 0x2044:
1777   - ch = 0xda;
1778   - break;
1779   - case 0xa4:
1780   - ch = 0xdb;
1781   - break;
1782   - case 0x2039:
1783   - ch = 0xdc;
1784   - break;
1785   - case 0x203a:
1786   - ch = 0xdd;
1787   - break;
1788   - case 0xfb01:
1789   - ch = 0xde;
1790   - break;
1791   - case 0xfb02:
1792   - ch = 0xdf;
1793   - break;
1794   - case 0x2021:
1795   - ch = 0xe0;
1796   - break;
1797   - case 0xb7:
1798   - ch = 0xe1;
1799   - break;
1800   - case 0x201a:
1801   - ch = 0xe2;
1802   - break;
1803   - case 0x201e:
1804   - ch = 0xe3;
1805   - break;
1806   - case 0x2030:
1807   - ch = 0xe4;
1808   - break;
1809   - case 0xc2:
1810   - ch = 0xe5;
1811   - break;
1812   - case 0xca:
1813   - ch = 0xe6;
1814   - break;
1815   - case 0xc1:
1816   - ch = 0xe7;
1817   - break;
1818   - case 0xcb:
1819   - ch = 0xe8;
1820   - break;
1821   - case 0xc8:
1822   - ch = 0xe9;
1823   - break;
1824   - case 0xcd:
1825   - ch = 0xea;
1826   - break;
1827   - case 0xce:
1828   - ch = 0xeb;
1829   - break;
1830   - case 0xcf:
1831   - ch = 0xec;
1832   - break;
1833   - case 0xcc:
1834   - ch = 0xed;
1835   - break;
1836   - case 0xd3:
1837   - ch = 0xee;
1838   - break;
1839   - case 0xd4:
1840   - ch = 0xef;
1841   - break;
1842   - case 0xd2:
1843   - ch = 0xf1;
1844   - break;
1845   - case 0xda:
1846   - ch = 0xf2;
1847   - break;
1848   - case 0xdb:
1849   - ch = 0xf3;
1850   - break;
1851   - case 0xd9:
1852   - ch = 0xf4;
1853   - break;
1854   - case 0x131:
1855   - ch = 0xf5;
1856   - break;
1857   - case 0x2c6:
1858   - ch = 0xf6;
1859   - break;
1860   - case 0x303:
1861   - ch = 0xf7;
1862   - break;
1863   - case 0x304:
1864   - ch = 0xf8;
1865   - break;
1866   - case 0x306:
1867   - ch = 0xf9;
1868   - break;
1869   - case 0x307:
1870   - ch = 0xfa;
1871   - break;
1872   - case 0x30a:
1873   - ch = 0xfb;
1874   - break;
1875   - case 0x327:
1876   - ch = 0xfc;
1877   - break;
1878   - case 0x30b:
1879   - ch = 0xfd;
1880   - break;
1881   - case 0x328:
1882   - ch = 0xfe;
1883   - break;
1884   - case 0x2c7:
1885   - ch = 0xff;
1886   - break;
1887   - default:
1888   - break;
1889   - }
1890   - return ch;
  1514 + auto i = unicode_to_mac_roman.find(codepoint);
  1515 + if (i != unicode_to_mac_roman.end()) {
  1516 + return i->second;
  1517 + }
  1518 + return '\0';
1891 1519 }
1892 1520  
1893 1521 static unsigned char
1894 1522 encode_pdfdoc(unsigned long codepoint)
1895 1523 {
1896   - // Use this ugly switch statement to avoid a static, which is not
1897   - // thread-safe.
1898   - unsigned char ch = '\0';
1899   - switch (codepoint) {
1900   - case 0x02d8:
1901   - ch = 0x18;
1902   - break;
1903   - case 0x02c7:
1904   - ch = 0x19;
1905   - break;
1906   - case 0x02c6:
1907   - ch = 0x1a;
1908   - break;
1909   - case 0x02d9:
1910   - ch = 0x1b;
1911   - break;
1912   - case 0x02dd:
1913   - ch = 0x1c;
1914   - break;
1915   - case 0x02db:
1916   - ch = 0x1d;
1917   - break;
1918   - case 0x02da:
1919   - ch = 0x1e;
1920   - break;
1921   - case 0x02dc:
1922   - ch = 0x1f;
1923   - break;
1924   - case 0x2022:
1925   - ch = 0x80;
1926   - break;
1927   - case 0x2020:
1928   - ch = 0x81;
1929   - break;
1930   - case 0x2021:
1931   - ch = 0x82;
1932   - break;
1933   - case 0x2026:
1934   - ch = 0x83;
1935   - break;
1936   - case 0x2014:
1937   - ch = 0x84;
1938   - break;
1939   - case 0x2013:
1940   - ch = 0x85;
1941   - break;
1942   - case 0x0192:
1943   - ch = 0x86;
1944   - break;
1945   - case 0x2044:
1946   - ch = 0x87;
1947   - break;
1948   - case 0x2039:
1949   - ch = 0x88;
1950   - break;
1951   - case 0x203a:
1952   - ch = 0x89;
1953   - break;
1954   - case 0x2212:
1955   - ch = 0x8a;
1956   - break;
1957   - case 0x2030:
1958   - ch = 0x8b;
1959   - break;
1960   - case 0x201e:
1961   - ch = 0x8c;
1962   - break;
1963   - case 0x201c:
1964   - ch = 0x8d;
1965   - break;
1966   - case 0x201d:
1967   - ch = 0x8e;
1968   - break;
1969   - case 0x2018:
1970   - ch = 0x8f;
1971   - break;
1972   - case 0x2019:
1973   - ch = 0x90;
1974   - break;
1975   - case 0x201a:
1976   - ch = 0x91;
1977   - break;
1978   - case 0x2122:
1979   - ch = 0x92;
1980   - break;
1981   - case 0xfb01:
1982   - ch = 0x93;
1983   - break;
1984   - case 0xfb02:
1985   - ch = 0x94;
1986   - break;
1987   - case 0x0141:
1988   - ch = 0x95;
1989   - break;
1990   - case 0x0152:
1991   - ch = 0x96;
1992   - break;
1993   - case 0x0160:
1994   - ch = 0x97;
1995   - break;
1996   - case 0x0178:
1997   - ch = 0x98;
1998   - break;
1999   - case 0x017d:
2000   - ch = 0x99;
2001   - break;
2002   - case 0x0131:
2003   - ch = 0x9a;
2004   - break;
2005   - case 0x0142:
2006   - ch = 0x9b;
2007   - break;
2008   - case 0x0153:
2009   - ch = 0x9c;
2010   - break;
2011   - case 0x0161:
2012   - ch = 0x9d;
2013   - break;
2014   - case 0x017e:
2015   - ch = 0x9e;
2016   - break;
2017   - case 0xfffd:
2018   - ch = 0x9f;
2019   - break;
2020   - case 0x20ac:
2021   - ch = 0xa0;
2022   - break;
2023   - default:
2024   - break;
2025   - }
2026   - return ch;
  1524 + auto i = unicode_to_pdf_doc.find(codepoint);
  1525 + if (i != unicode_to_pdf_doc.end()) {
  1526 + return i->second;
  1527 + }
  1528 + return '\0';
2027 1529 }
2028 1530  
2029 1531 unsigned long
... ...