Commit 303f0de147b7b3ff56f0e0328b62b1ab9b64cf85

Authored by Christian Herdtweck
Committed by Philippe Lagadec
1 parent f13931f4

ooxml: Implement wrapper around zip subfile streams

Often you want to know the size of a stream or reset it to its start,
neither of which is provided by ZipExtFile (the result of ZipFile.open).
Implement both in a dedicated wrapper class.
Showing 1 changed file with 109 additions and 9 deletions
oletools/ooxml.py
@@ -14,6 +14,7 @@ import sys @@ -14,6 +14,7 @@ import sys
14 import logging 14 import logging
15 from zipfile import ZipFile, BadZipfile 15 from zipfile import ZipFile, BadZipfile
16 from os.path import splitext 16 from os.path import splitext
  17 +import io
17 18
18 # import lxml or ElementTree for XML parsing: 19 # import lxml or ElementTree for XML parsing:
19 try: 20 try:
@@ -124,25 +125,124 @@ def is_ooxml(filename): @@ -124,25 +125,124 @@ def is_ooxml(filename):
124 return False 125 return False
125 126
126 127
127 -class ZipFileResetable(io.BufferedIOBase):  
128 - """ A file-like object like zip.open returns them, only can seek() to 0 128 +class ZipSubFile(object):
  129 + """ A file-like object like ZipFile.open returns them, with size and seek()
129 130
130 ZipFile.open() gives file handles that can be read but not seek()ed since 131 ZipFile.open() gives file handles that can be read but not seek()ed since
131 the file is being decompressed in the background. This class implements a 132 the file is being decompressed in the background. This class implements a
132 reset() function which corresponds to a seek to 0 (which just closes the 133 reset() function which corresponds to a seek to 0 (which just closes the
133 - stream and re-opens it behind the scenes 134 + stream and re-opens it behind the scenes.)
  135 + --> can be used e.g. for olefile.isOleFile()
  136 +
  137 + Can be used as a context manager::
  138 +
  139 + with zipfile.ZipFile('file.zip') as zipper:
  140 + with ZipSubFile(zipper, 'subfile') as handle:
  141 + print('subfile in file.zip has size {0}, starts with {1}'
  142 + .format(handle.size, handle.read(20)))
  143 + handle.reset()
  144 +
  145 + Attributes always present:
  146 + container: the containing zip file
  147 + name: name of file within zip file
  148 + mode: open-mode, 'r' per default
  149 + size: size of the stream (constructor arg or taken from ZipFile.getinfo)
  150 +
  151 + Attributes only not-None after open() and before close():
  152 + handle: direct handle to subfile stream, created by ZipFile.open()
  153 + pos: current position within stream
134 """ 154 """
135 155
136 - def __init__(self, container, filename, mode=None): 156 + def __init__(self, container, filename, mode='r', size=None):
  157 + """ remember all necessary vars but do not open yet """
137 self.container = container 158 self.container = container
138 self.name = filename 159 self.name = filename
139 - self.handle = container.open(filename, mode) 160 + if size is None:
  161 + self.size = container.getinfo(filename).file_size
  162 + logging.debug('zip stream has size {0}'.format(self.size))
  163 + else:
  164 + self.size = size
  165 + if 'w' in mode.lower():
  166 + raise ValueError('Can only read, mode "{0}" not allowed'
  167 + .format(mode))
  168 + self.mode = mode
  169 + self.handle = None
  170 + self.pos = None
  171 +
  172 + def open(self):
  173 + """ open subfile for reading; open mode given to constructor before """
  174 + if self.handle is not None:
  175 + raise IOError('re-opening file not supported!')
  176 + self.handle = self.container.open(self.name, self.mode)
  177 + self.pos = 0
  178 + return self
  179 +
  180 + def write(self, *args, **kwargs): # pylint: disable=unused-argument,no-self-use
  181 + """ write is not allowed """
  182 + raise IOError('writing not implemented')
  183 +
  184 + def read(self, size=-1):
  185 + """
  186 + read given number of bytes (or all data) from stream
140 187
141 - def read(self, size=None):  
142 - pass 188 + returns bytes (i.e. str in python2, bytes in python3)
  189 + """
  190 + if size is None:
  191 + self.pos = self.size
  192 + else:
  193 + self.pos += size
  194 + return self.handle.read(size)
143 195
144 def seek(self, pos, offset): 196 def seek(self, pos, offset):
145 - pass 197 + """ re-position point so read() will continue elsewhere
  198 +
  199 + only re-positioning to start of file is allowed
  200 + """
  201 + if pos == 0 and offset == io.SEEK_SET:
  202 + self.reset()
  203 + elif pos == -self.pos and offset == io.SEEK_CUR:
  204 + self.reset()
  205 + else:
  206 + raise NotImplementedError('could reset() and read()')
  207 +
  208 + def tell(self):
  209 + """ inform about position of next read """
  210 + return self.pos
  211 +
  212 + def reset(self):
  213 + """ close and re-open """
  214 + self.close()
  215 + self.open()
  216 +
  217 + def close(self):
  218 + """ close file """
  219 + if self.handle is not None:
  220 + self.handle.close()
  221 + self.pos = None
  222 + self.handle = None
  223 +
  224 + def __enter__(self):
  225 + """ start of context manager; opens the file """
  226 + self.open()
  227 + return self
  228 +
  229 + def __exit__(self, *args, **kwargs):
  230 + """ end of context manager; closes the file """
  231 + self.close()
  232 +
  233 + def __str__(self):
  234 + """ creates a nice textual representation for this object """
  235 + if self.handle is None:
  236 + status = 'closed'
  237 + elif self.pos == 0:
  238 + status = 'open, at start'
  239 + elif self.pos >= self.size:
  240 + status = 'open, at end'
  241 + else:
  242 + status = 'open, at pos {0}'.format(self.pos)
  243 +
  244 + return '[ZipSubFile {0} (size {1}, mode {2}, {3})]' \
  245 + .format(self.name, self.size, self.mode, status)
146 246
147 247
148 class XmlParser(object): 248 class XmlParser(object):
@@ -233,7 +333,7 @@ class XmlParser(object): @@ -233,7 +333,7 @@ class XmlParser(object):
233 333
234 yields 3-tuples (filename, content_type, file_handle) where 334 yields 3-tuples (filename, content_type, file_handle) where
235 content_type is based on filename or default for extension or is None, 335 content_type is based on filename or default for extension or is None,
236 - and file_handle is an open read-only handle for the file 336 + and file_handle is a ZipSubFile
237 """ 337 """
238 if not self.did_iter_all: 338 if not self.did_iter_all:
239 logging.warning('Did not iterate through complete file. ' 339 logging.warning('Did not iterate through complete file. '