generate_auto_job 54 KB
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 
1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218
#!/usr/bin/env python3
import os
import sys
import argparse
import hashlib
import re
import yaml
import json
import filecmp
from contextlib import contextmanager

# The purpose of this code is to automatically generate various parts
# of the QPDFJob class. It is fairly complicated and extremely
# bespoke, so understanding it is important if modifications are to be
# made.

# Documentation of QPDFJob is divided among three places:
#
# * "HOW TO ADD A COMMAND-LINE ARGUMENT" in README-maintainer provides
#   a quick reminder for how to add a command-line argument
#
# * This file has a detailed explanation about how QPDFJob and
#   generate_auto_job work together
#
# * The manual ("QPDFJob Design" in qpdf-job.rst) discusses the design
#   approach, rationale, and evolution of QPDFJob.
#
# QPDFJob solved the problem of moving extensive functionality that
# lived in qpdf.cc into the library. The QPDFJob class consists of
# four major sections:
#
# * The run() method and its subsidiaries are responsible for
#   performing the actual operations on PDF files. This is implemented
#   in QPDFJob.cc
#
# * The nested Config class and the other classes it creates provide
#   an API for setting up a QPDFJob instance and correspond to the
#   command-line arguments of the qpdf executable. This is implemented
#   in QPDFJob_config.cc
#
# * The argument parsing code reads an argv array and calls
#   configuration methods. This is implemented in QPDFJob_argv.cc. The
#   argument parsing logic itself is implemented in the QPDFArgParser
#   class.
#
# * The job JSON handling code, which reads a QPDFJob JSON file and
#   calls configuration methods. This is implemented in
#   QPDFJob_json.cc. The JSON parsing code is in the JSON class. A
#   sax-like JSON handler class that calls callbacks in response to
#   items in the JSON is implemented in the JSONHandler class.
#
# This code has the job of ensuring that configuration, command-line
# arguments, and JSON are all consistent and complete so that a
# developer or user can freely move among those different ways of
# interacting with QPDFJob in a predictable fashion. In addition, help
# information for each option appears in manual/cli.rst, and that
# information is used in the creation of the job JSON schema and to supply
# help text to QPDFArgParser. This code also ensures that there is an
# exact match between options in job.yml and options in cli.rst.
#
# The job.yml file contains the data that drives this code. To
# understand job.yml, here are some important concepts.
#
# QPDFArgParser option table. There is support for positional
# arguments, options consisting of flags and optional parameters, and
# subparsers that start with a regular parameterless flag, have their
# own positional and option sections, and are terminated with -- by
# itself. Examples of this include --encrypt and --pages. An "option
# table" contains an optional positional argument handler and a list
# of valid options with specifications about their parameters. There
# are three kinds of option tables:
#
# * The built-in "help" option table contains help commands, like
#   --help and --version, that are only valid when they appear as the
#   single command-line argument.
#
# * The "main" option table contains the options that are valid
#   starting at the beginning of argument parsing.
#
# * A named option table can be started manually by the argument
#   parsing code to switch the argument parser's context. Switching
#   the parser to a new option table is manual (via a call to
#   selectOptionTable). Context reverts to the main option table
#   automatically when -- is encountered.
#
# In QPDFJob.hh, there is a Config class for each option table except
# help.
#
# Option type: bare, required/optional parameter, required/optional
# choices. A bare argument is just a flag, like --qdf. A parameter
# option takes an arbitrary parameter, like --password. A choices
# option takes one of a fixed list of choices, like --object-streams.
# If a parameter or choices option's parameter is optional, the empty
# string may be specified as an option, such as --collate (or
# --collate=). For a bare option, --option= is always the same as just
# --option. This makes it possible to switch an option from bare to
# optional choice to optional parameter all without breaking
# compatibility.
#
# JSON "schema". This is a qpdf-specific "schema" for JSON. It is not
# related to any kind of standard JSON schema. It is described in
# JSON.hh and in the manual. QPDFJob uses the JSON "schema" in a mode
# in which keys in the schema are all optional in the JSON object.
#
# Here is the mapping between configuration, argv, and JSON.
#
# The help options table is implemented solely for argv processing and
# has no counterpart in configuration or JSON.
#
# The config() method returns a shared pointer to a Config object.
# Every command-line option in the main option table has a
# corresponding method in Config whose name is the option converted to
# camel case. For bare options and options with optional parameters, a
# version exists that takes no arguments. For other than bare options,
# a version exists, possibly in addition, that takes a std::string
# const&. For example, the --qdf flag implies a qdf() method in
# Config, and the --object-streams flag implies an
# objectStreams(std::string const&) method in Config. For flags in
# option tables, the method is declared inside a config class specific
# to the option table. The mapping between option tables and config
# classes is explicit in job.yml. Positional arguments are handled
# individually and manually -- see QPDFJob.hh in the CONFIGURATION
# section for details. See examples/qpdf-job.cc for an example.
#
# To understand the rest, start at main and follow comments in the
# code.

whoami = os.path.basename(sys.argv[0])
BANNER = f'''//
// This file is automatically generated by {whoami}.
// Edits will be automatically overwritten if the build is
// run in maintainer mode.
//
// clang-format off
//'''

MAN_BANNER = f'''.\\"
.\\" This file is automatically generated by {whoami}.
.\\" Edits will be automatically overwritten if the build is
.\\" run in maintainer mode.
.\\"
'''

def warn(*args, **kwargs):
    """Print a diagnostic message to standard error.

    Takes the same positional and keyword arguments as the built-in
    print(), except ``file``, which is always sys.stderr.
    """
    stream = sys.stderr
    print(*args, file=stream, **kwargs)


@contextmanager
def write_file(filename):
    """Context manager that writes a file only if its content changes.

    The caller writes to a temporary file named ``filename + '.tmp'``.
    When the ``with`` body finishes normally, the temporary file either
    replaces ``filename`` or, if ``filename`` already exists with
    byte-identical content, is simply deleted so that the existing file
    (and its modification time) is left alone.

    If the ``with`` body raises, the temporary file is removed, the
    existing ``filename`` is left untouched, and the exception
    propagates to the caller.

    :param filename: path of the file to (re)generate
    :return: yields a text-mode file object open for writing
    """
    tmpfile = filename + '.tmp'
    try:
        with open(tmpfile, 'w') as f:
            yield f
    except BaseException:
        # Don't leave a partial .tmp file behind on failure.
        if os.path.exists(tmpfile):
            os.unlink(tmpfile)
        raise
    # shallow=False forces a comparison of the actual file contents.
    if os.path.exists(filename) and filecmp.cmp(filename, tmpfile, False):
        os.unlink(tmpfile)
    else:
        # os.replace overwrites an existing destination on every
        # platform; os.rename fails on Windows if the target exists.
        os.replace(tmpfile, filename)


class Main:
    """
    Driver for generating the automatically generated QPDFJob files.

    The program runs in one of two modes, selected on the command line:
    --check reports whether the recorded checksums still match the
    input and output files, and --generate rewrites the output files
    and then records fresh checksums. In both modes nothing is
    regenerated when the checksums already match.

    Class attributes (shared by all instances):

    SOURCES -- list of input files whose contents affect the output.
    DESTS -- dict mapping a short key to the path of a generated file.
        top() adds one entry per config class found in job.yml, so
        this dict is modified at run time.
    SUMS -- name of the file holding one checksum per source and
        destination file.
    """
    # SOURCES is a list of source files whose contents are used by
    # this program. If they change, we are out of date.
    SOURCES = [
        # Keep this list in sync with CMakeLists.txt: auto_job_inputs
        whoami,
        'CMakeLists.txt',
        'manual/_ext/qpdf.py',
        'job.yml',
        'manual/cli.rst',
        'manual/qpdf.1.in',
    ]
    # DESTS is a map to the output files this code generates. These
    # generated files, as well as those added to DESTS later in the
    # code, are included in various places by QPDFJob.hh or any of the
    # implementing QPDFJob*.cc files.
    DESTS = {
        # Keep this list in sync with CMakeLists.txt: auto_job_outputs
        'decl': 'libqpdf/qpdf/auto_job_decl.hh',
        'init': 'libqpdf/qpdf/auto_job_init.hh',
        'help': 'libqpdf/qpdf/auto_job_help.hh',
        'schema': 'libqpdf/qpdf/auto_job_schema.hh',
        'json_decl': 'libqpdf/qpdf/auto_job_json_decl.hh',
        'json_init': 'libqpdf/qpdf/auto_job_json_init.hh',
        'man': 'manual/qpdf.1',
        # Others (one per config class in job.yml) are added in top()
    }
    # SUMS contains a checksum for each source and destination and is
    # used to detect whether we're up to date without having to force
    # recompilation all the time. This way the build can invoke this
    # script unconditionally without causing stuff to rebuild every
    # time.
    SUMS = 'job.sums'

    def main(self, args=sys.argv[1:], prog=whoami):
        """
        Entry point: parse the command line and run the selected mode.

        :param args: command-line arguments without the program name
        :param prog: program name passed to the argument parser
        :return: None
        """
        parsed = self.parse_args(args, prog)
        self.top(parsed)

    def parse_args(self, args, prog):
        parser = argparse.ArgumentParser(
            prog=prog,
            description='Generate files for QPDFJob',
        )
        mxg = parser.add_mutually_exclusive_group(required=True)
        mxg.add_argument('--check',
                         help='update checksums if files are not up to date',
                         action='store_true', default=False)
        mxg.add_argument('--generate',
                         help='generate files from sources',
                         action='store_true', default=False)
        return parser.parse_args(args)

    def top(self, options):
        """
        Load job.yml, register per-config output files, and run the
        mode selected on the command line.

        Every option table in job.yml that names a ``config`` gets an
        entry in DESTS (include/qpdf/auto_job_<config>.hh) and an empty
        declaration list in self.config_decls. This happens before the
        checksums are examined so that those files take part in the
        comparison.

        If the recorded checksums match, the process exits with status
        0 in either mode. Otherwise --check exits with an error message
        and --generate regenerates the files.

        :param options: namespace returned by parse_args(), with
            boolean attributes ``check`` and ``generate``
        :return: None; may terminate the process via sys.exit()
        """
        with open('job.yml', 'r') as f:
            data = yaml.safe_load(f.read())
        # config_decls maps a config key from an option in "options"
        # (from job.yml) to a list of declarations. A declaration is
        # generated for each config method for that option table.
        self.config_decls = {}
        # Keep track of which configs we've declared since we can have
        # option tables share a config class, as with the encryption
        # tables.
        self.declared_configs = set()

        # Update DESTS -- see above. This ensures that each config
        # class's contents are included in job.sums.
        for o in data['options']:
            config = o.get('config')
            if config is not None:
                self.DESTS[config] = f'include/qpdf/auto_job_{config}.hh'
                self.config_decls[config] = []

        # Use sys.exit() rather than the exit() helper installed by the
        # site module: the latter is meant for interactive sessions and
        # is not defined when the interpreter runs without site.
        if self.check_hashes():
            sys.exit(0)
        elif options.check:
            sys.exit(f'{whoami}: auto job inputs have changed')
        elif options.generate:
            self.generate(data)
        else:
            sys.exit(f'{whoami} unknown mode')

    def get_hashes(self):
        """
        Calculates and retrieves the SHA-256 hashes of files from source and destination paths.

        Summary:
        This method iterates over a collection of file paths from both source and
        destination attributes, calculates the SHA-256 hash for each existing file,
        and returns a dictionary containing the file paths and their corresponding
        hashes. If a file is not found, it is skipped.

        :return: A dictionary where keys are file paths (as str) and values are their
                 SHA-256 hashes (as str).
        :rtype: dict
        """
        hashes = {}
        for i in sorted([*self.SOURCES, *self.DESTS.values()]):
            m = hashlib.sha256()
            try:
                with open(i, 'rb') as f:
                    m.update(f.read())
                hashes[i] = m.hexdigest()
            except FileNotFoundError:
                pass
        return hashes

    def check_hashes(self):
        """
        Report whether the recorded checksums match the current files.

        The current checksums come from get_hashes(). The recorded ones
        are read from the SUMS file, one "<path> <digest>" pair per
        line; lines that do not have exactly that shape (such as the
        leading comment line) are ignored. When the two sets differ, a
        summary of new, modified, and vanished files is printed to
        standard output.

        A SUMS file that is missing or cannot be read is treated as a
        mismatch and nothing is printed.

        :return: True if every current checksum equals the recorded
            one and no entry is missing on either side, else False
        """
        hashes = self.get_hashes()
        old_hashes = {}
        try:
            with open(self.SUMS, 'r') as f:
                for line in f:
                    m = re.match(r'^(\S+) (\S+)\s*$', line)
                    if m:
                        old_hashes[m.group(1)] = m.group(2)
        except (OSError, UnicodeError):
            # No usable checksum file: we can't claim to be up to
            # date. Only I/O and decoding problems are handled here so
            # that programming errors are not silently hidden.
            return False
        if old_hashes == hashes:
            return True
        # Write to stdout, not stderr. What we write to stderr
        # is visible in a normal build. Writing to stdout will
        # hide it in that case but expose it if you directly
        # run ./generate_auto_job --check as in CI.
        print(f'*** {whoami} hash mismatches ***')
        for k, v in hashes.items():
            if k not in old_hashes:
                print(f'  {k} is not in {self.SUMS}')
            elif v != old_hashes[k]:
                print(f'  {k} was modified')
        for k in old_hashes:
            if k not in hashes:
                print(f'  {k} disappeared')
        return False

    def update_hashes(self):
        """
        Rewrite the SUMS file from the current state of the files.

        The file starts with a "# Generated by ..." comment line and is
        followed by one "<path> <digest>" line per entry returned by
        get_hashes(), in that dict's iteration order.

        :return: None
        """
        lines = [f'# Generated by {whoami}']
        lines.extend(
            f'{path} {digest}'
            for path, digest in self.get_hashes().items())
        with open(self.SUMS, 'w') as out:
            for text in lines:
                out.write(text + '\n')

    def generate_doc(self, df, f, f_man):
        """
        Turn the help directives in the manual into C++ help
        registration code and man page text.

        The lines of df are scanned by a small state machine that
        recognizes two directives:

        * ".. help-topic NAME: short text", followed by indented long
          text
        * ".. qpdf:option:: --option[...]", followed by an indented
          ".. help: short text" line, itself followed by indented long
          text

        For each topic an ap.addHelpTopic(...) call is written to f and
        a ".SH" section to f_man. For each option an
        ap.addOptionHelp(...) call is written to f and a ".TP" entry to
        f_man. The calls written to f are spread over several static
        functions add_help_1, add_help_2, ...; a final add_help()
        calls them all and adds the help footer.

        Side effects on self: sets all_topics and referenced_topics,
        removes every documented option from options_without_help,
        and stores each non-help option's short text in
        self.jdata[<option without leading -->]['help'].
        NOTE(review): self.jdata, self.help_options and the full
        contents of self.options_without_help are set up outside this
        method -- confirm against prepare().

        :param df: readable text stream containing the directives
            (generate() passes manual/cli.rst)
        :param f: writable text stream for the generated C++ help code
        :param f_man: writable text stream for the man page
        :raises Exception: if an option appears before any topic, an
            option line ends with a stray backtick, an option has no
            help line, long text is empty, help is given for an
            unknown option, a --help=TOPIC reference names an unknown
            topic, or some option is left without help
        :return: None
        """
        # States of the directive scanner.
        st_top = 0          # looking for the next directive
        st_topic = 1        # collecting long text of a help topic
        st_option = 2       # saw an option, waiting for its help line
        st_option_help = 3  # collecting long text of an option
        state = st_top

        # Data for the topic/option currently being collected.
        indent = None
        topic = None
        option = None
        short_text = None
        long_text = None

        # Generate a bunch of short static functions rather than a big
        # member function for help. Some compilers have problems with
        # very large member functions in classes in anonymous
        # namespaces.

        # help_files counts the add_help_N functions started so far.
        # help_lines is 0 when no function is open; it is set to 1 when
        # one is opened and grows by one per emitted topic or option.
        help_files = 0
        help_lines = 0

        # Option names are valid --help= targets too, so seed the set
        # of topics with the options known at this point.
        self.all_topics = set(self.options_without_help)
        self.referenced_topics = set()

        def set_indent(x):
            # Remember an indent as wide as the directive prefix x
            # (leading whitespace plus ".. "); the directive's body
            # lines are expected to start with it.
            nonlocal indent
            indent = ' ' * len(x)

        def append_long_text(line, topic):
            """
            Feed one input line to the long text being collected.

            A blank line or a line starting with the current indent is
            appended (with the indent removed) and False is returned.
            Any other line ends the text: long_text is stripped and
            given a single trailing newline, and True is returned. The
            terminating line itself is not added to the text.

            When the text ends and the name passed as topic does not
            contain 'help', every --help=NAME found in the text is
            recorded in self.referenced_topics.

            :param line: the current input line, including its newline
            :param topic: name of the topic or option the text belongs
                to; used for the error message and the 'help' check
            :raises Exception: if the collected text is empty
            :return: True if this line ended the long text, else False
            """
            nonlocal indent, long_text
            if line == '\n':
                long_text += '\n'
            elif line.startswith(indent):
                long_text += line[len(indent):]
            else:
                long_text = long_text.strip()
                if long_text == '':
                    raise Exception(f'missing long text for {topic}')
                long_text += '\n'
                if 'help' not in topic:
                    # Help for --help itself has --help=... not
                    # referring to specific options.
                    for i in re.finditer(r'--help=([^\.\s]+)', long_text):
                        self.referenced_topics.add(i.group(1))
                return True
            return False

        def manify(text):
            """
            Convert bullet lists in help text to man page markup.

            A line starting with '- ' becomes an '.IP' bullet request
            followed by the rest of the line. Lines that directly
            follow a bullet item and start with two spaces are treated
            as its continuation and lose those two spaces. All other
            lines are passed through unchanged.

            :param text: help text, possibly spanning several lines
            :return: the text with bullet items converted
            """
            lines = text.split('\n')
            out = []
            last_was_item = False
            for line in lines:
                if line.startswith('- '):
                    last_was_item = True
                    out.append('.IP \\[bu]')
                    out.append(line[2:])
                elif last_was_item and line.startswith('  '):
                    out.append(line[2:])
                else:
                    last_was_item = False
                    out.append(line)
            return '\n'.join(out)

        # Topic of the most recently emitted option; used to print the
        # "Related Options:" heading once per topic in the man page.
        last_option_topic = ''
        lineno = 0
        for line in df.readlines():
            if help_lines == 0:
                # No function is open: close the previous one, if any,
                # and start the next add_help_N.
                if help_files > 0:
                    print('}', file=f)
                help_files += 1
                help_lines += 1
                print(f'static void add_help_{help_files}(QPDFArgParser& ap)\n'
                      '{', file=f)
            lineno += 1
            if state == st_top:
                m = re.match(r'^(\s*\.\. )help-topic (\S+): (.*)$', line)
                if m:
                    set_indent(m.group(1))
                    topic = m.group(2)
                    short_text = m.group(3)
                    long_text = ''
                    state = st_topic
                    continue
                # group 2 is the whole synopsis, group 3 the bare
                # option name up to the first '=', '[' or whitespace.
                m = re.match(
                    r'^(\s*\.\. )qpdf:option:: (([^=\[\s]+)([\[= ](.+))?)$',
                    line)
                if m:
                    if topic is None:
                        raise Exception('option seen before topic')
                    set_indent(m.group(1))
                    option = m.group(3)
                    synopsis = m.group(2)
                    if synopsis.endswith('`'):
                        raise Exception(
                            f'stray ` at end of option line (line {lineno})')
                    # If the option takes a parameter, the long text
                    # starts with the full synopsis line.
                    if synopsis != option:
                        long_text = synopsis + '\n'
                    else:
                        long_text = ''
                    state = st_option
                    continue
            elif state == st_topic:
                # The line that ends the long text is consumed here and
                # is not re-examined as a possible directive.
                if append_long_text(line, topic):
                    self.all_topics.add(topic)
                    print(f'ap.addHelpTopic("{topic}", "{short_text}",'
                          f' R"({long_text})");', file=f)
                    print(f'.SH {topic.upper()} ({short_text})', file=f_man)
                    print(manify(long_text), file=f_man, end='')
                    help_lines += 1
                    state = st_top
            elif state == st_option:
                # Skip blank/indented lines until the help line shows
                # up; a non-indented line first means it is missing.
                if line == '\n' or line.startswith(indent):
                    m = re.match(r'^(\s*\.\. )help: (.*)$', line)
                    if m:
                        set_indent(m.group(1))
                        short_text = m.group(2)
                        state = st_option_help
                else:
                    raise Exception('option without help text')
            elif state == st_option_help:
                # As with topics, the terminating line is consumed.
                if append_long_text(line, option):
                    if option in self.options_without_help:
                        self.options_without_help.remove(option)
                    else:
                        raise Exception(
                            f'help for unknown option {option},'
                            f' lineno={lineno}')
                    if option not in self.help_options:
                        # option[2:] drops the leading "--".
                        self.jdata[option[2:]]['help'] = short_text
                    print(f'ap.addOptionHelp("{option}", "{topic}",'
                          f' "{short_text}", R"({long_text})");', file=f)
                    if last_option_topic != topic:
                        print('.PP\nRelated Options:', file=f_man)
                    last_option_topic = topic
                    print(f'.TP\n.B {option} \\-\\- {short_text}', file=f_man)
                    print(manify(long_text), file=f_man, end='')
                    help_lines += 1
                    state = st_top
            # Once the counter reaches 20, force a new add_help_N
            # function to be started at the next input line.
            if help_lines == 20:
                help_lines = 0
        # Close the last add_help_N and emit the add_help() driver.
        print('}', file=f)
        print('static void add_help(QPDFArgParser& ap)\n{', file=f)
        for i in range(help_files):
            print(f'    add_help_{i+1}(ap);', file=f)
        print('ap.addHelpFooter("For detailed help, visit'
              ' the qpdf manual: https://qpdf.readthedocs.io\\n");', file=f)
        print('}\n', file=f)
        print('''.SH SEE ALSO
.PP
For a summary of qpdf's options, please run \\fBqpdf \\-\\-help\\fR.
A complete manual can be found at https://qpdf.readthedocs.io.
''', file=f_man, end='')
        # Consistency checks: every --help=NAME reference must name a
        # known topic or option, and every option must have help.
        for i in self.referenced_topics:
            if i not in self.all_topics:
                raise Exception(f'help text referenced --help={i}')
        # The loop body raises on the first iteration, so this acts as
        # "if any option is left, report all of them".
        for i in self.options_without_help:
            raise Exception(
                'Options without help: ' +
                ', '.join(self.options_without_help))

    def generate(self, data):
        """
        Regenerate every output file and then record new checksums.

        Steps, in order: validate the job data, read the project
        version from CMakeLists.txt, compute declarations and
        initializers with prepare(), write the decl/init headers, write
        the help header and man page from manual/qpdf.1.in and
        manual/cli.rst, generate the JSON schema and write the schema,
        per-config, and JSON headers, and finally update the checksum
        file.

        :param data: parsed contents of job.yml
        :raises Exception: if no version is found in CMakeLists.txt
        :return: None
        """
        warn(f'{whoami}: regenerating auto job files')
        self.validate(data)

        # The version is the second word of a line that starts with
        # "VERSION "; if several lines qualify, the last one wins.
        version = None
        with open('CMakeLists.txt', 'r') as cmake:
            for raw in cmake:
                stripped = raw.strip()
                if stripped.startswith('VERSION '):
                    version = stripped.split(' ')[1]
        if version is None:
            raise Exception("can't read version from CMakeLists.txt")

        # Keep track of which options are help options since they are
        # handled specially. Add the built-in help options to tables
        # that we populate as we read job.yml since we won't encounter
        # these in job.yml
        self.help_options = {'--completion-bash', '--completion-zsh', '--help'}
        # Keep track of which options we have encountered but haven't
        # seen help text for. This enables us to report if any option
        # is missing help.
        self.options_without_help = set(self.help_options)

        def write_with_banner(key, entries):
            # Write the banner followed by one entry per line to the
            # output file registered under key in DESTS.
            with write_file(self.DESTS[key]) as out:
                print(BANNER, file=out)
                for entry in entries:
                    print(entry, file=out)

        # Compute the information needed for generated files and write
        # the files.
        self.prepare(data)
        write_with_banner('decl', self.decls)
        write_with_banner('init', self.init)
        with write_file(self.DESTS['help']) as help_out:
            with write_file(self.DESTS['man']) as man_out:
                # The man page is the banner, then the template with
                # the version filled in, then the generated help.
                man_out.write(MAN_BANNER)
                with open('manual/qpdf.1.in', 'r') as template:
                    for text in template:
                        man_out.write(
                            text.replace('@PROJECT_VERSION@', version))
                with open('manual/cli.rst', 'r') as df:
                    print(BANNER, file=help_out)
                    self.generate_doc(df, help_out, man_out)

        # Compute the json files after the config and arg parsing
        # files. We need to have full information about all the
        # options before we can generate the schema. Generating the
        # schema also generates the json header files.
        self.generate_schema(data)
        with write_file(self.DESTS['schema']) as schema_out:
            schema_json = json.dumps(
                self.schema, indent=2, separators=(',', ': '))
            print(
                f'static constexpr char const* JOB_SCHEMA_DATA = R"({schema_json})";',
                file=schema_out)
        for config, declarations in self.config_decls.items():
            write_with_banner(config, declarations)
        write_with_banner('json_decl', self.json_decls)
        write_with_banner('json_init', self.json_init)

        # Update hashes last to ensure that this will be rerun in the
        # event of a failure.
        self.update_hashes()
        # DON'T ADD CODE TO generate AFTER update_hashes

    def handle_trivial(self, i, identifier, cfg, prefix, kind, v):
        """
        Register a "trivial" option and declare its config method.

        A trivial option needs no hand-written argv handler: the generated
        lambda simply forwards to the method named ``identifier`` on the
        config object ``cfg``. The registration statement is appended to
        ``self.init``; the method declaration is appended to
        ``self.config_decls[cfg]`` the first time its signature is seen.

        :param i: option name passed to the QPDFArgParser ``add*`` call
        :param identifier: name of the config method the lambda invokes
        :param cfg: C++ expression for the config object the method is
            called on; also the key into ``self.config_decls``
        :param prefix: prepended to ``Config`` to form the return type of
            the declared method
        :param kind: ``bare``, ``required_parameter``,
            ``optional_parameter``, ``required_choices`` or
            ``optional_choices``; any other value registers nothing but
            still declares a one-argument method
        :param v: last argument of ``addRequiredParameter`` for
            ``required_parameter``; stem of the ``<v>_choices`` array for
            the choices kinds; unused otherwise
        :return: None
        """
        # Lambda shared by every kind that receives a parameter.
        forward = f'[this](std::string const& x){{{cfg}->{identifier}(x);}}'
        if kind == 'bare':
            registration = (f'this->ap.addBare("{i}", '
                            f'[this](){{{cfg}->{identifier}();}});')
        elif kind == 'required_parameter':
            registration = (f'this->ap.addRequiredParameter("{i}", '
                            f'{forward}, "{v}");')
        elif kind == 'optional_parameter':
            registration = (f'this->ap.addOptionalParameter("{i}", '
                            f'{forward});')
        elif kind in ('required_choices', 'optional_choices'):
            required = 'true' if kind == 'required_choices' else 'false'
            registration = (f'this->ap.addChoices("{i}", '
                            f'{forward}, {required}, {v}_choices);')
        else:
            registration = None
        if registration is not None:
            self.init.append(registration)

        # Declarations for config methods are collected separately for
        # each config object.
        return_type = prefix + 'Config'
        takes_parameter = kind != 'bare'
        wants_no_arg_overload = kind in ('optional_parameter',
                                         'optional_choices')
        params = 'std::string const& parameter' if takes_parameter else ''
        signature = f'{return_type}* {identifier}({params})'
        if signature in self.declared_configs:
            return
        self.declared_configs.add(signature)
        declarations = self.config_decls[cfg]
        declarations.append(f'QPDF_DLL {signature};')
        if wants_no_arg_overload:
            # Instead of giving the parameter a default value, declare
            # an additional overload without arguments. That way an
            # option can move from bare to optional_parameter or
            # optional_choices without breaking binary compatibility.
            # Both overloads must be implemented by hand; nothing
            # generated calls them, so a forgotten one only shows up
            # as a link error for whoever tries to use it.
            declarations.append(f'QPDF_DLL {return_type}* {identifier}();')

    def handle_flag(self, i, identifier, kind, v):
        """
        Declare and register a hand-written argv handler for one option.

        Appends the handler's declaration to ``self.decls`` and the
        matching QPDFArgParser registration to ``self.init``. The handler
        itself is not generated; per the original note it has to be
        implemented manually in QPDFJob_argv.cc, and a missing
        implementation shows up as a compiler/linker error.

        :param i: option name passed to the QPDFArgParser ``add*`` call
        :type i: str
        :param identifier: name of the ``ArgParser`` handler method
        :type identifier: str
        :param kind: ``bare``, ``required_parameter``,
            ``optional_parameter``, ``required_choices`` or
            ``optional_choices``; any other value is ignored
        :type kind: str
        :param v: last argument of ``addRequiredParameter`` for
            ``required_parameter``; stem of the ``<v>_choices`` array for
            the choices kinds; unused otherwise
        :return: None
        """
        if kind == 'bare':
            self.decls.append(f'void {identifier}();')
            self.init.append(
                f'this->ap.addBare("{i}", b(&ArgParser::{identifier}));')
            return

        # Every remaining kind binds a handler taking one string.
        bound = f'p(&ArgParser::{identifier})'
        registrations = {
            'required_parameter':
                f'this->ap.addRequiredParameter("{i}", {bound}, "{v}");',
            'optional_parameter':
                f'this->ap.addOptionalParameter("{i}", {bound});',
            'required_choices':
                f'this->ap.addChoices("{i}", {bound}, true, {v}_choices);',
            'optional_choices':
                f'this->ap.addChoices("{i}", {bound}, false, {v}_choices);',
        }
        if kind not in registrations:
            return
        self.decls.append(f'void {identifier}(std::string const&);')
        self.init.append(registrations[kind])

    def prepare(self, data):
        """
        Build the argv declarations and registration code from ``data``.

        Resets ``self.decls``, ``self.init``, ``self.json_decls``,
        ``self.json_init``, ``self.jdata`` and ``self.by_table``, then
        walks ``data['choices']`` and ``data['options']`` to fill them in.
        Options in the ``help`` table are added to ``self.help_options``;
        every other option is recorded in ``self.jdata`` along with the
        tables it appears in. Every option encountered is also added to
        ``self.options_without_help``.

        :param data: dictionary with a ``choices`` mapping and an
            ``options`` list of option-table dictionaries
        :type data: dict
        :return: None
        """
        self.decls = []         # argv handler declarations
        self.init = []          # code that initializes arg parsing
        self.json_decls = []    # json handler declarations
        self.json_init = []     # code that initializes json handlers
        self.jdata = {}         # running data used for json generation
        self.by_table = {}      # table information keyed by name

        def record_flag(flag, table, details):
            """
            Remember which table ``flag`` appeared in.

            Flags in the ``help`` table are only added to
            ``self.help_options``. For any other table, ``details`` is
            stored under ``self.jdata[flag]['tables'][table]``.
            """
            if table == 'help':
                self.help_options.add(f'--{flag}')
                return
            entry = self.jdata.setdefault(flag, {'tables': {}})
            entry['tables'][table] = details

        # helper functions
        self.init.extend([
            'auto b = [this](void (ArgParser::*f)()) {',
            '    return QPDFArgParser::bindBare(f, this);',
            '};',
            'auto p = [this](void (ArgParser::*f)(std::string const&)) {',
            '    return QPDFArgParser::bindParam(f, this);',
            '};',
            '',
        ])

        # One static array per set of choices, emitted into both the
        # argv and the json initialization code.
        for name, values in data['choices'].items():
            quoted = ''.join(f'"{value}", ' for value in values)
            array_decl = (
                f'static char const* {name}_choices[] = {{{quoted}0}};')
            self.init.append(array_decl)
            self.json_init.append(array_decl)
        self.init.append('')
        self.json_init.append('')

        # Constants for the table names so the handlers do not have to
        # hard-code strings.
        for section in data['options']:
            table_name = section['table']
            if table_name not in ('main', 'help'):
                constant = self.to_identifier(table_name, 'O', True)
                self.decls.append(
                    f'static constexpr char const* {constant}'
                    f' = "{table_name}";')
        self.decls.append('')

        # Walk through all the options adding declarations for the
        # option handlers and initialization code to register the
        # handlers in QPDFArgParser. For "trivial" cases,
        # QPDFArgParser will call the corresponding config method
        # automatically. Otherwise, it will declare a handler that you
        # have to explicitly implement.

        # If you add a new option table, you have to set config to the
        # name of a member variable that you declare in the ArgParser
        # class in QPDFJob_argv.cc. Then there should be an option in
        # the main table, also listed as manual in job.yml, that
        # switches to it. See implementations of any of the existing
        # options that do this for examples.
        for section in data['options']:
            table = section['table']
            config = section.get('config', None)
            table_prefix = section.get('prefix', '')
            arg_prefix = 'arg' + table_prefix
            config_prefix = section.get('config_prefix', table_prefix)
            manual = section.get('manual', [])
            json_prefix = table_prefix or table
            self.by_table[json_prefix] = {
                'config': config,
                'manual': manual,
            }

            is_subsidiary = table not in ('main', 'help')
            if is_subsidiary:
                end_handler = self.to_identifier(table, 'argEnd', False)
                self.init.append(
                    f'this->ap.registerOptionTable("{table}",'
                    f' b(&ArgParser::{end_handler}));')
            elif table == 'main':
                self.init.append('this->ap.selectMainOptionTable();')
            else:
                self.init.append('this->ap.selectHelpOptionTable();')

            if section.get('positional', False):
                positional = f'{arg_prefix}Positional'
                self.decls.append(f'void {positional}(std::string const&);')
                self.init.append(
                    f'this->ap.addPositional(p(&ArgParser::{positional}));')

            # Collect every flag of this table as name -> [kind, value],
            # in the same order the kinds are listed here.
            flags = {}
            flags.update(
                (name, ['bare', None])
                for name in section.get('bare', []))
            flags.update(
                (name, ['required_parameter', arg])
                for name, arg in section.get('required_parameter',
                                             {}).items())
            flags.update(
                (name, ['optional_parameter', None])
                for name in section.get('optional_parameter', []))
            flags.update(
                (name, ['required_choices', arg])
                for name, arg in section.get('required_choices',
                                             {}).items())
            for name, arg in section.get('optional_choices', {}).items():
                flags[name] = ['optional_choices', arg]
                self.options_without_help.add(f'--{name}')

            for name, [kind, arg] in flags.items():
                self.options_without_help.add(f'--{name}')
                record_flag(name, json_prefix, [kind, arg])
                needs_manual_handler = config is None or name in manual
                if needs_manual_handler:
                    handler = self.to_identifier(name, arg_prefix, False)
                    self.handle_flag(name, handler, kind, arg)
                else:
                    method = self.to_identifier(name, '', False)
                    self.handle_trivial(
                        name, method, config, config_prefix, kind, arg)

            # Subsidiary options tables need end methods to do any
            # final checking within the option table. Final checking
            # for the main option table is handled by
            # checkConfiguration, which is called explicitly in the
            # QPDFJob code.
            if is_subsidiary:
                self.decls.append(f'void {end_handler}();')

    def handle_json_trivial(self, flag_key, fdata):
        """
        Append the JSON handler registration for a trivial option.

        The generated handler forwards straight to the config method
        named ``flag_key``. The config object is looked up through
        ``self.by_table`` for the tables listed in ``fdata``; the kind and
        choices stem used are those of the last table iterated.

        :param flag_key: name of the config method to call
        :param fdata: dictionary whose ``tables`` entry maps table names
            to ``[kind, value]`` pairs
        :return: None
        """
        config = None
        for table, [kind, v] in fdata['tables'].items():
            # The caller has established that, when there are several
            # tables, they all share the same config.
            config = self.by_table[table]['config']

        method = f'{config}->{flag_key}'
        with_param = f'[this](std::string const& p) {{ {method}(p); }}'
        if kind == 'bare':
            statement = f'addBare([this]() {{ {method}(); }});'
        elif kind in ('required_parameter', 'optional_parameter'):
            # An omitted optional parameter arrives as the empty
            # string, so the handler has to cope with that. The empty
            # string is accepted for required parameters as well.
            statement = f'addParameter({with_param});'
        elif kind in ('required_choices', 'optional_choices'):
            required = 'true' if kind == 'required_choices' else 'false'
            statement = f'addChoices({v}_choices, {required}, {with_param});'
        else:
            return
        self.json_init.append(statement)

    def handle_json_manual(self, path):
        """
        Declare and invoke a hand-written JSON setup method for ``path``.

        The method name is ``setup`` followed by ``path`` with every
        ``.<alphanumeric>`` pair replaced by that character in upper case,
        e.g. ``.encrypt.userPassword`` becomes
        ``setupEncryptUserPassword``. Its declaration goes to
        ``self.json_decls`` and a call to it goes to ``self.json_init``.

        :param path: dotted path of the key within the job JSON
        :type path: str
        :return: None
        """
        def capitalize_after_dot(match):
            return match.group(1).upper()

        method = re.sub(r'\.([a-zA-Z0-9])', capitalize_after_dot,
                        f'setup{path}')
        self.json_decls.append(f'void {method}();')
        self.json_init.append(f'{method}();')

    def option_to_json_key(self, s):
        """
        Return the JSON key for option name ``s``.

        This is ``to_identifier`` with an empty prefix and ``const`` false.

        :param s: option name
        :return: the identifier produced by ``to_identifier``
        """
        return self.to_identifier(s, prefix='', const=False)

    def flag_to_schema_key(self, k):
        """
        Map a key from the json section to the key used in the schema.

        A leading ``_`` is dropped. Otherwise every ``<text>.`` prefix is
        removed, so ``main.password`` is reduced to ``password``. The
        remainder is passed through ``option_to_json_key``.

        :param k: key as it appears in the json section of the input data
        :return: the schema key
        """
        bare_key = k[1:] if k.startswith('_') else re.sub(r'[^\.]+\.', '', k)
        return self.option_to_json_key(bare_key)

    def build_schema(self, j, path, flag, expected, options_seen):
        """
        Recursively build the schema for one node of the json section.

        While walking, JSON handler declarations are appended to
        ``self.json_decls`` and handler registrations to
        ``self.json_init``.

        :param j: the part of the "json" data currently being traversed
        :param path: string representation of the path within the json
        :param flag: the key of this node as written in the json section
            (a command-line flag, or a key starting with ``_``)
        :param expected: map of command-line options we expect to see
        :param options_seen: set of options seen so far; updated in place
        :return: the schema value for this node, or None for a key that
            starts with ``__``
        :raises Exception: for an unknown key, a described key that does
            not start with ``_``, or an array whose length is not 1
        """
        # As described in job.yml, the json can have keys that don't
        # map to options. This includes keys whose values are
        # dictionaries as well as keys that correspond to positional
        # arguments. These start with _ and get their help from
        # job.yml. Things that correspond to options get their help
        # from the help text we gathered from cli.rst.
        is_option = flag in expected
        if is_option:
            options_seen.add(flag)
        else:
            if flag.startswith('__'):
                # A flag marked this way has no JSON equivalent
                # because it is handled in some other fashion.
                options_seen.add(flag[2:])
                return None
            has_underscore = flag.startswith('_')
            if isinstance(j, str):
                if not has_underscore:
                    raise Exception(f'json: {flag} has a description'
                                    ' but doesn\'t start with _')
            elif flag != '' and not has_underscore:
                raise Exception(f'json: unknown key {flag}')

        # The logic here is subtle and makes sense if you understand
        # how our JSON schemas work. They are described in JSON.hh,
        # but basically, if you see a dictionary, the schema should
        # have a dictionary with the same keys whose values are
        # descriptive. If you see an array, the array should have
        # single member that describes each element of the array. See
        # JSON.hh for details.

        # See comments in QPDFJob_json.cc in the Handlers class
        # declaration to understand how and why the methods called
        # here work. The idea is that Handlers keeps a stack of
        # JSONHandler shared pointers so that we can register our
        # handlers in the right place as we go.
        if isinstance(j, dict):
            members = {}
            if flag:
                name = self.to_identifier(path, '', False)
                self.json_decls.extend([
                    f'void begin{name}(JSON);',
                    f'void end{name}();',
                ])
                self.json_init.append(
                    f'beginDict(bindJSON(&Handlers::begin{name}),'
                    f' bindBare(&Handlers::end{name})); // {path}')
            for child_flag, child in j.items():
                schema_key = self.flag_to_schema_key(child_flag)
                self.json_init.append(f'pushKey("{schema_key}");')
                members[schema_key] = self.build_schema(
                    child, f'{path}.{schema_key}', child_flag,
                    expected, options_seen)
                self.json_init.append(f'popHandler(); // key: {schema_key}')
            return members

        if isinstance(j, list):
            if len(j) != 1:
                raise Exception('json contains array with length != 1')
            name = self.to_identifier(path, '', False)
            self.json_decls.extend([
                f'void begin{name}Array(JSON);',
                f'void end{name}Array();',
            ])
            self.json_init.append(
                f'beginArray(bindJSON(&Handlers::begin{name}Array),'
                f' bindBare(&Handlers::end{name}Array));'
                f' // {path}[]')
            element = self.build_schema(
                j[0], path, flag, expected, options_seen)
            self.json_init.append(f'popHandler(); // array: {path}[]')
            return [element]

        # Leaf: the schema value is the description. When none is
        # given, use the option's help text with each --option
        # reference rewritten to its json key.
        if j is None:
            leaf = re.sub(
                r'--([^\s=]+)',
                lambda m: self.option_to_json_key(m.group(1)),
                expected[flag]['help'])
        else:
            leaf = j

        # A leaf is trivial only if it is a known option that is not
        # listed as manual in any of its tables and whose tables do
        # not disagree about the config object.
        trivial = is_option
        if is_option:
            shared_config = None
            for table in expected[flag]['tables']:
                info = self.by_table[table]
                if flag in info['manual']:
                    trivial = False
                if shared_config is None:
                    shared_config = info['config']
                elif shared_config != info['config']:
                    trivial = False
        config_key = self.flag_to_schema_key(flag)
        if trivial:
            self.handle_json_trivial(config_key, expected[flag])
        else:
            self.handle_json_manual(path)
        return leaf

    def generate_schema(self, data):
        """
        Build ``self.schema`` and check it covers every recorded option.

        The options recorded in ``self.jdata`` are turned into the set of
        keys expected in ``data['json']``; ``build_schema`` then walks the
        json section and the result is stored in ``self.schema``.

        :param data: dictionary whose ``json`` entry describes the job
            JSON structure
        :return: None
        :raises Exception: if the options seen while walking the json
            section differ from the expected ones
        """
        # Check to make sure that every command-line option is
        # represented in data['json']. Build a list of options that we
        # expect. If an option appears once, we just expect to see it
        # once. If it appears in more than one options table, we need
        # to see a separate version of it for each option table. It is
        # represented in job.yml prepended with the table prefix. The
        # table prefix is removed in the schema. Example: "password"
        # appears multiple times, so the json section of job.yml has
        # main.password, uo.password, etc. But most options appear
        # only once, so we can just list them as they are. There is a
        # nearly exact match between option tables and dictionary in
        # the job json schema, but it's not perfect because of how
        # positional arguments are handled, so we have to do this
        # extra work. Information about which tables a particular
        # option appeared in is gathered up in prepare().
        expected = {}
        for option, info in self.jdata.items():
            tables = info['tables']
            if len(tables) == 1:
                expected[option] = dict(info)
                continue
            for table in sorted(tables):
                expected[f'{table}.{option}'] = dict(info)
        seen = set()

        # Walk through the json information building the schema as we
        # go. This verifies consistency between command-line options
        # and the json section of the data and builds up a schema by
        # populating with help information as available. In addition
        # to generating the schema, we declare and register json
        # handlers that correspond with it. That way, we can first
        # check a job JSON file against the schema, and if it matches,
        # we have fewer error opportunities while calling handlers.
        self.schema = self.build_schema(
            data['json'], '', '', expected, seen)
        wanted = set(expected)
        if seen != wanted:
            raise Exception('missing from json: ' + str(wanted - seen))

    def check_keys(self, what, d, exp):
        """
        Terminate the program if ``d`` is not a dictionary or has
        unknown keys.

        Only keys that are present but not listed in ``exp`` are
        rejected; keys listed in ``exp`` but absent from ``d`` are not
        detected here.

        :param what: label for the thing being checked; used as the
            prefix of the error message
        :type what: str
        :param d: the object whose keys are inspected
        :param exp: the set of keys that are allowed in ``d``
        :type exp: set
        :return: None
        :raises SystemExit: with an error message as its code when ``d``
            is not a dictionary or contains keys outside ``exp``
        """
        # Raise SystemExit directly rather than calling exit(): exit()
        # is a convenience injected by the site module for interactive
        # use and is not guaranteed to exist (e.g. under "python -S").
        # The effect is the same: the message goes to stderr and the
        # process exits with status 1.
        if not isinstance(d, dict):
            raise SystemExit(f'{what} is not a dictionary')
        extra = set(d.keys()) - exp
        if extra:
            raise SystemExit(f'{what}: unknown keys = {extra}')

    def validate(self, data):
        """
        Reject unknown keys in ``data`` and in each of its option tables.

        The top level may only contain ``choices``, ``options`` and
        ``json``. Each element of ``data['options']`` may only contain
        the option-table keys listed below. Checking is delegated to
        ``check_keys``, which terminates the program on failure; missing
        keys are not detected by this method.

        :param data: the loaded input data
        :type data: dict
        :return: None
        :raises KeyError: if ``data`` has no ``options`` entry
        """
        self.check_keys('top', data, {'choices', 'options', 'json'})
        option_keys = {
            'table', 'prefix', 'config', 'config_prefix',
            'manual', 'bare', 'positional',
            'optional_parameter', 'required_parameter',
            'required_choices', 'optional_choices',
        }
        for index, o in enumerate(data['options']):
            # Label each entry by its position so that an error points
            # at the offending option table instead of the top level.
            self.check_keys(f'options[{index}]', o, option_keys)

    def to_identifier(self, label, prefix, const):
        """
        Turn ``label`` into an identifier.

        Every character of ``label`` that is not an ASCII letter or digit
        is first replaced with ``_``. For a constant the result is upper
        cased and joined to ``prefix`` with ``_`` (even when ``prefix`` is
        empty). Otherwise ``prefix``, if non-empty, is joined on with
        ``_``, each ``_<lowercase letter>`` is replaced by the upper-case
        letter, and any remaining ``_`` is removed.

        :param label: the text to convert
        :type label: str
        :param prefix: text placed in front of the identifier
        :type prefix: str
        :param const: whether to produce the upper-case constant form
        :type const: bool
        :return: the identifier
        :rtype: str
        """
        sanitized = re.sub(r'[^a-zA-Z0-9]', '_', label)
        if const:
            return f'{prefix}_{sanitized.upper()}'
        snake = f'{prefix}_{sanitized}' if prefix else sanitized
        camel = re.sub(r'_([a-z])', lambda m: m.group(1).upper(), snake)
        return camel.replace('_', '')


if __name__ == '__main__':
    try:
        # Run from the directory containing this script; the generator
        # opens files such as manual/cli.rst by relative path.
        os.chdir(os.path.dirname(os.path.realpath(__file__)))
        Main().main()
    except KeyboardInterrupt:
        # Exit quietly with status 130 on Ctrl-C instead of printing a
        # traceback. SystemExit is raised directly because exit() is a
        # site-module helper meant for interactive use and is not
        # guaranteed to be defined.
        raise SystemExit(130)