Commit 303f0de147b7b3ff56f0e0328b62b1ab9b64cf85

Authored by Christian Herdtweck
Committed by Philippe Lagadec
1 parent f13931f4

ooxml: Implement wrapper around zip subfile streams

Often you want to know the size of a stream or reset it to its start, neither
of which is provided by ZipExtFile (the result of ZipFile.open). Implement
both in a dedicated wrapper class.
Showing 1 changed file with 109 additions and 9 deletions
oletools/ooxml.py
... ... @@ -14,6 +14,7 @@ import sys
14 14 import logging
15 15 from zipfile import ZipFile, BadZipfile
16 16 from os.path import splitext
  17 +import io
17 18  
18 19 # import lxml or ElementTree for XML parsing:
19 20 try:
... ... @@ -124,25 +125,124 @@ def is_ooxml(filename):
124 125 return False
125 126  
126 127  
class ZipSubFile(object):
    """ A file-like object like ZipFile.open returns them, with size and seek()

    ZipFile.open() gives file handles that can be read but not seek()ed since
    the file is being decompressed in the background. This class implements a
    reset() function which corresponds to a seek to 0 (which just closes the
    stream and re-opens it behind the scenes.)
    --> can be used e.g. for olefile.isOleFile()

    Can be used as a context manager::

        with zipfile.ZipFile('file.zip') as zipper:
            with ZipSubFile(zipper, 'subfile') as handle:
                print('subfile in file.zip has size {0}, starts with {1}'
                      .format(handle.size, handle.read(20)))
                handle.reset()

    Attributes always present:
    container: the containing zip file
    name: name of file within zip file
    mode: open-mode, 'r' per default
    size: size of the stream (constructor arg or taken from ZipFile.getinfo)

    Attributes only not-None after open() and before close():
    handle: direct handle to subfile stream, created by ZipFile.open()
    pos: current position within stream
    """

    def __init__(self, container, filename, mode='r', size=None):
        """ remember all necessary vars but do not open yet

        :param container: zip file object providing getinfo() and open()
        :param filename: name of the subfile within the container
        :param mode: open mode handed to container.open(); None means 'r'
        :param size: size of the stream; looked up via getinfo() if None
        :raises ValueError: if mode contains 'w' (this wrapper is read-only)
        """
        self.container = container
        self.name = filename
        if size is None:
            self.size = container.getinfo(filename).file_size
            logging.debug('zip stream has size {0}'.format(self.size))
        else:
            self.size = size
        if mode is None:
            # tolerate "no mode given"; avoids AttributeError on None.lower()
            mode = 'r'
        if 'w' in mode.lower():
            raise ValueError('Can only read, mode "{0}" not allowed'
                             .format(mode))
        self.mode = mode
        self.handle = None
        self.pos = None

    def open(self):
        """ open subfile for reading; open mode given to constructor before

        :returns: self, so the call can be chained
        :raises IOError: if the subfile is already open
        """
        if self.handle is not None:
            raise IOError('re-opening file not supported!')
        self.handle = self.container.open(self.name, self.mode)
        self.pos = 0
        return self

    def write(self, *args, **kwargs):    # pylint: disable=unused-argument,no-self-use
        """ write is not allowed """
        raise IOError('writing not implemented')

    def read(self, size=-1):
        """
        read given number of bytes (or all data) from stream

        A size of None or a negative size reads everything up to the end.

        returns bytes (i.e. str in python2, bytes in python3)

        :raises ValueError: if the subfile is not open
        """
        if self.handle is None:
            raise ValueError('I/O operation on closed file')
        if size is None:
            size = -1
        data = self.handle.read(size)
        # Advance by what was actually returned: the request may be negative
        # ("read all") or reach beyond the end of the stream.
        self.pos += len(data)
        return data

    def seek(self, pos, offset=io.SEEK_SET):
        """ re-position point so read() will continue elsewhere

        only re-positioning to start of file is allowed (plus seeks that do
        not move an open stream at all)

        :param pos: offset relative to the reference point given by `offset`
        :param offset: io.SEEK_SET (default), io.SEEK_CUR or io.SEEK_END
        :returns: the new position within the stream
        :raises NotImplementedError: for every other target position
        """
        if offset == io.SEEK_SET:
            target = pos
        elif offset == io.SEEK_CUR and self.pos is not None:
            target = self.pos + pos
        elif offset == io.SEEK_END:
            target = self.size + pos
        else:
            target = None

        if target == 0:
            self.reset()
        elif target is None or self.handle is None or target != self.pos:
            raise NotImplementedError('could reset() and read()')
        return self.pos

    def tell(self):
        """ inform about position of next read """
        return self.pos

    def reset(self):
        """ close and re-open """
        self.close()
        self.open()

    def close(self):
        """ close file """
        try:
            if self.handle is not None:
                self.handle.close()
        finally:
            # forget the handle even if closing it failed
            self.pos = None
            self.handle = None

    def __enter__(self):
        """ start of context manager; opens the file """
        self.open()
        return self

    def __exit__(self, *args, **kwargs):
        """ end of context manager; closes the file """
        self.close()

    def __str__(self):
        """ creates a nice textual representation for this object """
        if self.handle is None:
            status = 'closed'
        elif self.pos == 0:
            status = 'open, at start'
        elif self.pos >= self.size:
            status = 'open, at end'
        else:
            status = 'open, at pos {0}'.format(self.pos)

        return '[ZipSubFile {0} (size {1}, mode {2}, {3})]' \
               .format(self.name, self.size, self.mode, status)
146 246  
147 247  
148 248 class XmlParser(object):
... ... @@ -233,7 +333,7 @@ class XmlParser(object):
233 333  
234 334 yields 3-tuples (filename, content_type, file_handle) where
235 335 content_type is based on filename or default for extension or is None,
236   - and file_handle is an open read-only handle for the file
  336 + and file_handle is a ZipSubFile
237 337 """
238 338 if not self.did_iter_all:
239 339 logging.warning('Did not iterate through complete file. '
... ...