Commit d4eb585e075304a7fd77fb7b8803e25a31908e70

Authored by Christian Herdtweck
1 parent a7d1050e

ooxml: re-implement complete seek(); add attr closed

OleFileIO requires a complete seek() and checks for closed attribute.

Also added some commented-out debug print statements to ZipSubFile.
Showing 1 changed file with 69 additions and 32 deletions
oletools/ooxml.py
... ... @@ -130,13 +130,13 @@ class ZipSubFile(object):
130 130  
131 131 ZipFile.open() gives file handles that can be read but not seek()ed since
132 132 the file is being decompressed in the background. This class implements a
133   - reset() function which corresponds to a seek to 0 (which just closes the
134   - stream and re-opens it behind the scenes.)
135   - --> can be used e.g. for olefile.isOleFile()
  133 + reset() function (close and re-open stream) and a seek() that uses it.
  134 + --> can be used as argument to olefile.OleFileIO and olefile.isOleFile()
136 135  
137 136 Can be used as a context manager::
138 137  
139 138 with zipfile.ZipFile('file.zip') as zipper:
  139 + # use this instead of: with zipper.open('subfile') as handle:
140 140 with ZipSubFile(zipper, 'subfile') as handle:
141 141 print('subfile in file.zip has size {0}, starts with {1}'
142 142 .format(handle.size, handle.read(20)))
... ... @@ -147,10 +147,12 @@ class ZipSubFile(object):
147 147 name: name of file within zip file
148 148 mode: open-mode, 'r' per default
149 149 size: size of the stream (constructor arg or taken from ZipFile.getinfo)
  150 + closed: False if there was an open() but no close() since then, else True
150 151  
151 152 Attributes only not-None after open() and before close():
152   - handle: direkt handle to subfile stream, created by ZipFile.open()
153   - pos: current position within stream
  153 + handle: direct handle to subfile stream, created by ZipFile.open()
  154 + pos: current position within stream (can deviate from actual position in
  155 + self.handle if we fake jump to end)
154 156 """
155 157  
156 158 def __init__(self, container, filename, mode='r', size=None):
... ... @@ -168,6 +170,7 @@ class ZipSubFile(object):
168 170 self.mode = mode
169 171 self.handle = None
170 172 self.pos = None
  173 + self.closed = True
171 174  
172 175 def open(self):
173 176 """ open subfile for reading; open mode given to constructor before """
... ... @@ -175,6 +178,8 @@ class ZipSubFile(object):
175 178 raise IOError('re-opening file not supported!')
176 179 self.handle = self.container.open(self.name, self.mode)
177 180 self.pos = 0
  181 + self.closed = False
  182 + # print('ZipSubFile: opened; size={}'.format(self.size))
178 183 return self
179 184  
180 185 def write(self, *args, **kwargs): # pylint: disable=unused-argument,no-self-use
... ... @@ -187,59 +192,91 @@ class ZipSubFile(object):
187 192  
188 193 returns bytes (i.e. str in python2, bytes in python3)
189 194 """
190   - if size is None:
191   - self.pos = self.size
192   - else:
193   - self.pos += size
194   - return self.handle.read(size)
195   -
196   - def seek(self, pos, offset):
197   - """ re-position point so read() will continue elsewhere
198   -
199   - only re-positioning to start of file is allowed
200   - """
201   - CHUNK_SIZE = 4096
202   - if pos == 0 and offset == io.SEEK_SET:
203   - self.reset()
  195 + if self.handle is None:
  196 + raise IOError('read on closed handle')
  197 + if self.pos >= self.size:
  198 + # print('ZipSubFile: read fake at end')
  199 + return b'' # fake being at the end, even if we are not
  200 + data = self.handle.read(size)
  201 + self.pos += len(data)
  202 + # print('ZipSubFile: read {} bytes, pos now {}'.format(size, self.pos))
  203 + return data
  204 +
  205 + def seek(self, pos, offset=io.SEEK_SET):
  206 + """ re-position point so read() will continue elsewhere """
  207 + # calc target position from self.pos, pos and offset
  208 + if offset == io.SEEK_SET:
  209 + new_pos = pos
204 210 elif offset == io.SEEK_CUR:
205   - if pos == -self.pos:
206   - self.reset()
207   - elif pos == 0:
208   - return
209   - elif pos > 0:
210   - skipped = 0
211   - n_chunks, leftover = divmod(pos, CHUNK_SIZE)
212   - for _ in range(n_chunks):
213   - self.read(CHUNK_SIZE) # just read and discard
214   - self.read(leftover)
215   - else:
216   - raise NotImplementedError('could reset() and read()?')
  211 + new_pos = self.pos + pos
  212 + elif offset == io.SEEK_END:
  213 + new_pos = self.size + pos
217 214 else:
218   - raise NotImplementedError('could reset() and read()?')
  215 + raise ValueError("invalid offset {0}, need SEEK_* constant"
  216 + .format(offset))
  217 +
  218 + # now get to that position, doing reads and resets as necessary
  219 + if new_pos < 0:
  220 + # print('ZipSubFile: Error: seek to {}'.format(new_pos))
  221 + raise IOError('Seek beyond start of file not allowed')
  222 + elif new_pos == self.pos:
  223 + # print('ZipSubFile: nothing to do')
  224 + pass
  225 + elif new_pos == 0:
  226 + # print('ZipSubFile: seek to start')
  227 + self.reset()
  228 + elif new_pos < self.pos:
  229 + # print('ZipSubFile: seek back')
  230 + self.reset()
  231 + self._seek_skip(new_pos) # --> read --> update self.pos
  232 + elif new_pos < self.size:
  233 + # print('ZipSubFile: seek forward')
  234 + self._seek_skip(new_pos - self.pos) # --> read --> update self.pos
  235 + else: # new_pos >= self.size
  236 + # print('ZipSubFile: seek to end')
  237 + self.pos = new_pos # fake being at the end; remember pos >= size
  238 +
  239 + def _seek_skip(self, to_skip):
  240 + """ helper for seek: skip forward by given amount using read() """
  241 + # print('ZipSubFile: seek by skipping {} bytes starting at {}'
  242 + # .format(to_skip, self.pos))
  243 + CHUNK_SIZE = 4096
  244 + n_chunks, leftover = divmod(to_skip, CHUNK_SIZE)
  245 + for _ in range(n_chunks):
  246 + self.read(CHUNK_SIZE) # just read and discard
  247 + self.read(leftover)
  248 + # print('ZipSubFile: seek by skipping done, pos now {}'
  249 + # .format(self.pos))
219 250  
220 251 def tell(self):
221 252 """ inform about position of next read """
  253 + # print('ZipSubFile: tell-ing we are at {}'.format(self.pos))
222 254 return self.pos
223 255  
224 256 def reset(self):
225 257 """ close and re-open """
  258 + # print('ZipSubFile: resetting')
226 259 self.close()
227 260 self.open()
228 261  
229 262 def close(self):
230 263 """ close file """
  264 + # print('ZipSubFile: closing')
231 265 if self.handle is not None:
232 266 self.handle.close()
233 267 self.pos = None
234 268 self.handle = None
  269 + self.closed = True
235 270  
236 271 def __enter__(self):
237 272 """ start of context manager; opens the file """
  273 + # print('ZipSubFile: entering context')
238 274 self.open()
239 275 return self
240 276  
241 277 def __exit__(self, *args, **kwargs):
242 278 """ end of context manager; closes the file """
  279 + # print('ZipSubFile: exiting context')
243 280 self.close()
244 281  
245 282 def __str__(self):
... ...