Commit 303f0de147b7b3ff56f0e0328b62b1ab9b64cf85
Committed by
Philippe Lagadec
1 parent
f13931f4
ooxml: Implement wrapper around zip subfile streams
Often you want to know the size of a stream or reset it to its start, neither of which is provided by ZipExtFile (the result of ZipFile.open). Implement both in a dedicated wrapper class.
Showing
1 changed file
with
109 additions
and
9 deletions
oletools/ooxml.py
| @@ -14,6 +14,7 @@ import sys | @@ -14,6 +14,7 @@ import sys | ||
| 14 | import logging | 14 | import logging |
| 15 | from zipfile import ZipFile, BadZipfile | 15 | from zipfile import ZipFile, BadZipfile |
| 16 | from os.path import splitext | 16 | from os.path import splitext |
| 17 | +import io | ||
| 17 | 18 | ||
| 18 | # import lxml or ElementTree for XML parsing: | 19 | # import lxml or ElementTree for XML parsing: |
| 19 | try: | 20 | try: |
| @@ -124,25 +125,124 @@ def is_ooxml(filename): | @@ -124,25 +125,124 @@ def is_ooxml(filename): | ||
| 124 | return False | 125 | return False |
| 125 | 126 | ||
| 126 | 127 | ||
| 127 | -class ZipFileResetable(io.BufferedIOBase): | ||
| 128 | - """ A file-like object like zip.open returns them, only can seek() to 0 | 128 | +class ZipSubFile(object): |
| 129 | + """ A file-like object like ZipFile.open returns them, with size and seek() | ||
| 129 | 130 | ||
| 130 | ZipFile.open() gives file handles that can be read but not seek()ed since | 131 | ZipFile.open() gives file handles that can be read but not seek()ed since |
| 131 | the file is being decompressed in the background. This class implements a | 132 | the file is being decompressed in the background. This class implements a |
| 132 | reset() function which corresponds to a seek to 0 (which just closes the | 133 | reset() function which corresponds to a seek to 0 (which just closes the |
| 133 | - stream and re-opens it behind the scenes | 134 | + stream and re-opens it behind the scenes.) |
| 135 | + --> can be used e.g. for olefile.isOleFile() | ||
| 136 | + | ||
| 137 | + Can be used as a context manager:: | ||
| 138 | + | ||
| 139 | + with zipfile.ZipFile('file.zip') as zipper: | ||
| 140 | + with ZipSubFile(zipper, 'subfile') as handle: | ||
| 141 | + print('subfile in file.zip has size {0}, starts with {1}' | ||
| 142 | + .format(handle.size, handle.read(20))) | ||
| 143 | + handle.reset() | ||
| 144 | + | ||
| 145 | + Attributes always present: | ||
| 146 | + container: the containing zip file | ||
| 147 | + name: name of file within zip file | ||
| 148 | + mode: open-mode, 'r' per default | ||
| 149 | + size: size of the stream (constructor arg or taken from ZipFile.getinfo) | ||
| 150 | + | ||
| 151 | + Attributes that are not None only between open() and close(): | ||
| 152 | + handle: direct handle to subfile stream, created by ZipFile.open() | ||
| 153 | + pos: current position within stream | ||
| 134 | """ | 154 | """ |
| 135 | 155 | ||
| 136 | - def __init__(self, container, filename, mode=None): | 156 | + def __init__(self, container, filename, mode='r', size=None): |
| 157 | + """ remember all necessary vars but do not open yet """ | ||
| 137 | self.container = container | 158 | self.container = container |
| 138 | self.name = filename | 159 | self.name = filename |
| 139 | - self.handle = container.open(filename, mode) | 160 | + if size is None: |
| 161 | + self.size = container.getinfo(filename).file_size | ||
| 162 | + logging.debug('zip stream has size {0}'.format(self.size)) | ||
| 163 | + else: | ||
| 164 | + self.size = size | ||
| 165 | + if 'w' in mode.lower(): | ||
| 166 | + raise ValueError('Can only read, mode "{0}" not allowed' | ||
| 167 | + .format(mode)) | ||
| 168 | + self.mode = mode | ||
| 169 | + self.handle = None | ||
| 170 | + self.pos = None | ||
| 171 | + | ||
| 172 | + def open(self): | ||
| 173 | + """ open subfile for reading; open mode given to constructor before """ | ||
| 174 | + if self.handle is not None: | ||
| 175 | + raise IOError('re-opening file not supported!') | ||
| 176 | + self.handle = self.container.open(self.name, self.mode) | ||
| 177 | + self.pos = 0 | ||
| 178 | + return self | ||
| 179 | + | ||
| 180 | + def write(self, *args, **kwargs): # pylint: disable=unused-argument,no-self-use | ||
| 181 | + """ write is not allowed """ | ||
| 182 | + raise IOError('writing not implemented') | ||
| 183 | + | ||
| 184 | + def read(self, size=-1): | ||
| 185 | + """ | ||
| 186 | + read given number of bytes (or all data) from stream | ||
| 140 | 187 | ||
| 141 | - def read(self, size=None): | ||
| 142 | - pass | 188 | + returns bytes (i.e. str in python2, bytes in python3) |
| 189 | + """ | ||
| 190 | + if size is None: | ||
| 191 | + self.pos = self.size | ||
| 192 | + else: | ||
| 193 | + self.pos += size | ||
| 194 | + return self.handle.read(size) | ||
| 143 | 195 | ||
| 144 | def seek(self, pos, offset): | 196 | def seek(self, pos, offset): |
| 145 | - pass | 197 | + """ re-position point so read() will continue elsewhere |
| 198 | + | ||
| 199 | + only re-positioning to start of file is allowed | ||
| 200 | + """ | ||
| 201 | + if pos == 0 and offset == io.SEEK_SET: | ||
| 202 | + self.reset() | ||
| 203 | + elif pos == -self.pos and offset == io.SEEK_CUR: | ||
| 204 | + self.reset() | ||
| 205 | + else: | ||
| 206 | + raise NotImplementedError('could reset() and read()') | ||
| 207 | + | ||
| 208 | + def tell(self): | ||
| 209 | + """ inform about position of next read """ | ||
| 210 | + return self.pos | ||
| 211 | + | ||
| 212 | + def reset(self): | ||
| 213 | + """ close and re-open """ | ||
| 214 | + self.close() | ||
| 215 | + self.open() | ||
| 216 | + | ||
| 217 | + def close(self): | ||
| 218 | + """ close file """ | ||
| 219 | + if self.handle is not None: | ||
| 220 | + self.handle.close() | ||
| 221 | + self.pos = None | ||
| 222 | + self.handle = None | ||
| 223 | + | ||
| 224 | + def __enter__(self): | ||
| 225 | + """ start of context manager; opens the file """ | ||
| 226 | + self.open() | ||
| 227 | + return self | ||
| 228 | + | ||
| 229 | + def __exit__(self, *args, **kwargs): | ||
| 230 | + """ end of context manager; closes the file """ | ||
| 231 | + self.close() | ||
| 232 | + | ||
| 233 | + def __str__(self): | ||
| 234 | + """ creates a nice textual representation for this object """ | ||
| 235 | + if self.handle is None: | ||
| 236 | + status = 'closed' | ||
| 237 | + elif self.pos == 0: | ||
| 238 | + status = 'open, at start' | ||
| 239 | + elif self.pos >= self.size: | ||
| 240 | + status = 'open, at end' | ||
| 241 | + else: | ||
| 242 | + status = 'open, at pos {0}'.format(self.pos) | ||
| 243 | + | ||
| 244 | + return '[ZipSubFile {0} (size {1}, mode {2}, {3})]' \ | ||
| 245 | + .format(self.name, self.size, self.mode, status) | ||
| 146 | 246 | ||
| 147 | 247 | ||
| 148 | class XmlParser(object): | 248 | class XmlParser(object): |
| @@ -233,7 +333,7 @@ class XmlParser(object): | @@ -233,7 +333,7 @@ class XmlParser(object): | ||
| 233 | 333 | ||
| 234 | yields 3-tuples (filename, content_type, file_handle) where | 334 | yields 3-tuples (filename, content_type, file_handle) where |
| 235 | content_type is based on filename or default for extension or is None, | 335 | content_type is based on filename or default for extension or is None, |
| 236 | - and file_handle is an open read-only handle for the file | 336 | + and file_handle is a ZipSubFile |
| 237 | """ | 337 | """ |
| 238 | if not self.did_iter_all: | 338 | if not self.did_iter_all: |
| 239 | logging.warning('Did not iterate through complete file. ' | 339 | logging.warning('Did not iterate through complete file. ' |