Commit 303f0de147b7b3ff56f0e0328b62b1ab9b64cf85
Committed by
Philippe Lagadec
1 parent
f13931f4
ooxml: Implement wrapper around zip subfile streams
Often you want to know the size of a stream or reset it to its start, neither of which is provided by ZipExtFile (the result of ZipFile.open). Implement both in a dedicated class.
Showing
1 changed file
with
109 additions
and
9 deletions
oletools/ooxml.py
| ... | ... | @@ -14,6 +14,7 @@ import sys |
| 14 | 14 | import logging |
| 15 | 15 | from zipfile import ZipFile, BadZipfile |
| 16 | 16 | from os.path import splitext |
| 17 | +import io | |
| 17 | 18 | |
| 18 | 19 | # import lxml or ElementTree for XML parsing: |
| 19 | 20 | try: |
| ... | ... | @@ -124,25 +125,124 @@ def is_ooxml(filename): |
| 124 | 125 | return False |
| 125 | 126 | |
| 126 | 127 | |
| 127 | -class ZipFileResetable(io.BufferedIOBase): | |
| 128 | - """ A file-like object like zip.open returns them, only can seek() to 0 | |
| 128 | +class ZipSubFile(object): | |
| 129 | + """ A file-like object like ZipFile.open returns them, with size and seek() | |
| 129 | 130 | |
| 130 | 131 | ZipFile.open() gives file handles that can be read but not seek()ed since |
| 131 | 132 | the file is being decompressed in the background. This class implements a |
| 132 | 133 | reset() function which corresponds to a seek to 0 (which just closes the |
| 133 | - stream and re-opens it behind the scenes | |
| 134 | + stream and re-opens it behind the scenes.) | |
| 135 | + --> can be used e.g. for olefile.isOleFile() | |
| 136 | + | |
| 137 | + Can be used as a context manager:: | |
| 138 | + | |
| 139 | + with zipfile.ZipFile('file.zip') as zipper: | |
| 140 | + with ZipSubFile(zipper, 'subfile') as handle: | |
| 141 | + print('subfile in file.zip has size {0}, starts with {1}' | |
| 142 | + .format(handle.size, handle.read(20))) | |
| 143 | + handle.reset() | |
| 144 | + | |
| 145 | + Attributes always present: | |
| 146 | + container: the containing zip file | |
| 147 | + name: name of file within zip file | |
| 148 | + mode: open-mode, 'r' per default | |
| 149 | + size: size of the stream (constructor arg or taken from ZipFile.getinfo) | |
| 150 | + | |
| 151 | + Attributes only not-None after open() and before close(): | |
| 152 | + handle: direct handle to subfile stream, created by ZipFile.open() | |
| 153 | + pos: current position within stream | |
| 134 | 154 | """ |
| 135 | 155 | |
| 136 | - def __init__(self, container, filename, mode=None): | |
| 156 | + def __init__(self, container, filename, mode='r', size=None): | |
| 157 | + """ remember all necessary vars but do not open yet """ | |
| 137 | 158 | self.container = container |
| 138 | 159 | self.name = filename |
| 139 | - self.handle = container.open(filename, mode) | |
| 160 | + if size is None: | |
| 161 | + self.size = container.getinfo(filename).file_size | |
| 162 | + logging.debug('zip stream has size {0}'.format(self.size)) | |
| 163 | + else: | |
| 164 | + self.size = size | |
| 165 | + if 'w' in mode.lower(): | |
| 166 | + raise ValueError('Can only read, mode "{0}" not allowed' | |
| 167 | + .format(mode)) | |
| 168 | + self.mode = mode | |
| 169 | + self.handle = None | |
| 170 | + self.pos = None | |
| 171 | + | |
| 172 | + def open(self): | |
| 173 | + """ open subfile for reading; open mode given to constructor before """ | |
| 174 | + if self.handle is not None: | |
| 175 | + raise IOError('re-opening file not supported!') | |
| 176 | + self.handle = self.container.open(self.name, self.mode) | |
| 177 | + self.pos = 0 | |
| 178 | + return self | |
| 179 | + | |
| 180 | + def write(self, *args, **kwargs): # pylint: disable=unused-argument,no-self-use | |
| 181 | + """ write is not allowed """ | |
| 182 | + raise IOError('writing not implemented') | |
| 183 | + | |
| 184 | + def read(self, size=-1): | |
| 185 | + """ | |
| 186 | + read given number of bytes (or all data) from stream | |
| 140 | 187 | |
| 141 | - def read(self, size=None): | |
| 142 | - pass | |
| 188 | + returns bytes (i.e. str in python2, bytes in python3) | |
| 189 | + """ | |
| 190 | + if size is None: | |
| 191 | + self.pos = self.size | |
| 192 | + else: | |
| 193 | + self.pos += size | |
| 194 | + return self.handle.read(size) | |
| 143 | 195 | |
| 144 | 196 | def seek(self, pos, offset): |
| 145 | - pass | |
| 197 | + """ re-position point so read() will continue elsewhere | |
| 198 | + | |
| 199 | + only re-positioning to start of file is allowed | |
| 200 | + """ | |
| 201 | + if pos == 0 and offset == io.SEEK_SET: | |
| 202 | + self.reset() | |
| 203 | + elif pos == -self.pos and offset == io.SEEK_CUR: | |
| 204 | + self.reset() | |
| 205 | + else: | |
| 206 | + raise NotImplementedError('could reset() and read()') | |
| 207 | + | |
| 208 | + def tell(self): | |
| 209 | + """ inform about position of next read """ | |
| 210 | + return self.pos | |
| 211 | + | |
| 212 | + def reset(self): | |
| 213 | + """ close and re-open """ | |
| 214 | + self.close() | |
| 215 | + self.open() | |
| 216 | + | |
| 217 | + def close(self): | |
| 218 | + """ close file """ | |
| 219 | + if self.handle is not None: | |
| 220 | + self.handle.close() | |
| 221 | + self.pos = None | |
| 222 | + self.handle = None | |
| 223 | + | |
| 224 | + def __enter__(self): | |
| 225 | + """ start of context manager; opens the file """ | |
| 226 | + self.open() | |
| 227 | + return self | |
| 228 | + | |
| 229 | + def __exit__(self, *args, **kwargs): | |
| 230 | + """ end of context manager; closes the file """ | |
| 231 | + self.close() | |
| 232 | + | |
| 233 | + def __str__(self): | |
| 234 | + """ creates a nice textual representation for this object """ | |
| 235 | + if self.handle is None: | |
| 236 | + status = 'closed' | |
| 237 | + elif self.pos == 0: | |
| 238 | + status = 'open, at start' | |
| 239 | + elif self.pos >= self.size: | |
| 240 | + status = 'open, at end' | |
| 241 | + else: | |
| 242 | + status = 'open, at pos {0}'.format(self.pos) | |
| 243 | + | |
| 244 | + return '[ZipSubFile {0} (size {1}, mode {2}, {3})]' \ | |
| 245 | + .format(self.name, self.size, self.mode, status) | |
| 146 | 246 | |
| 147 | 247 | |
| 148 | 248 | class XmlParser(object): |
| ... | ... | @@ -233,7 +333,7 @@ class XmlParser(object): |
| 233 | 333 | |
| 234 | 334 | yields 3-tuples (filename, content_type, file_handle) where |
| 235 | 335 | content_type is based on filename or default for extension or is None, |
| 236 | - and file_handle is an open read-only handle for the file | |
| 336 | + and file_handle is a ZipSubFile | |
| 237 | 337 | """ |
| 238 | 338 | if not self.did_iter_all: |
| 239 | 339 | logging.warning('Did not iterate through complete file. ' | ... | ... |