Commit d4eb585e075304a7fd77fb7b8803e25a31908e70

Authored by Christian Herdtweck
1 parent a7d1050e

ooxml: re-implement complete seek(); add attr closed

OleFileIO requires a complete seek() implementation and checks for a `closed` attribute.

Also added some commented-out debug print statements to ZipSubFile.
Showing 1 changed file with 69 additions and 32 deletions
oletools/ooxml.py
@@ -130,13 +130,13 @@ class ZipSubFile(object): @@ -130,13 +130,13 @@ class ZipSubFile(object):
130 130
131 ZipFile.open() gives file handles that can be read but not seek()ed since 131 ZipFile.open() gives file handles that can be read but not seek()ed since
132 the file is being decompressed in the background. This class implements a 132 the file is being decompressed in the background. This class implements a
133 - reset() function which corresponds to a seek to 0 (which just closes the  
134 - stream and re-opens it behind the scenes.)  
135 - --> can be used e.g. for olefile.isOleFile() 133 + reset() function (close and re-open stream) and a seek() that uses it.
  134 + --> can be used as argument to olefile.OleFileIO and olefile.isOleFile()
136 135
137 Can be used as a context manager:: 136 Can be used as a context manager::
138 137
139 with zipfile.ZipFile('file.zip') as zipper: 138 with zipfile.ZipFile('file.zip') as zipper:
  139 + # replaces: with zipper.open('subfile') as handle:
140 with ZipSubFile(zipper, 'subfile') as handle: 140 with ZipSubFile(zipper, 'subfile') as handle:
141 print('subfile in file.zip has size {0}, starts with {1}' 141 print('subfile in file.zip has size {0}, starts with {1}'
142 .format(handle.size, handle.read(20))) 142 .format(handle.size, handle.read(20)))
@@ -147,10 +147,12 @@ class ZipSubFile(object): @@ -147,10 +147,12 @@ class ZipSubFile(object):
147 name: name of file within zip file 147 name: name of file within zip file
148 mode: open-mode, 'r' per default 148 mode: open-mode, 'r' per default
149 size: size of the stream (constructor arg or taken from ZipFile.getinfo) 149 size: size of the stream (constructor arg or taken from ZipFile.getinfo)
  150 + closed: True unless the subfile is currently open (i.e. before open() or after close())
150 151
151 Attributes only not-None after open() and before close(): 152 Attributes only not-None after open() and before close():
152 - handle: direkt handle to subfile stream, created by ZipFile.open()  
153 - pos: current position within stream 153 + handle: direct handle to subfile stream, created by ZipFile.open()
  154 + pos: current position within stream (can deviate from actual position in
  155 + self.handle if we fake jump to end)
154 """ 156 """
155 157
156 def __init__(self, container, filename, mode='r', size=None): 158 def __init__(self, container, filename, mode='r', size=None):
@@ -168,6 +170,7 @@ class ZipSubFile(object): @@ -168,6 +170,7 @@ class ZipSubFile(object):
168 self.mode = mode 170 self.mode = mode
169 self.handle = None 171 self.handle = None
170 self.pos = None 172 self.pos = None
  173 + self.closed = True
171 174
172 def open(self): 175 def open(self):
173 """ open subfile for reading; open mode given to constructor before """ 176 """ open subfile for reading; open mode given to constructor before """
@@ -175,6 +178,8 @@ class ZipSubFile(object): @@ -175,6 +178,8 @@ class ZipSubFile(object):
175 raise IOError('re-opening file not supported!') 178 raise IOError('re-opening file not supported!')
176 self.handle = self.container.open(self.name, self.mode) 179 self.handle = self.container.open(self.name, self.mode)
177 self.pos = 0 180 self.pos = 0
  181 + self.closed = False
  182 + # print('ZipSubFile: opened; size={}'.format(self.size))
178 return self 183 return self
179 184
180 def write(self, *args, **kwargs): # pylint: disable=unused-argument,no-self-use 185 def write(self, *args, **kwargs): # pylint: disable=unused-argument,no-self-use
@@ -187,59 +192,91 @@ class ZipSubFile(object): @@ -187,59 +192,91 @@ class ZipSubFile(object):
187 192
188 returns bytes (i.e. str in python2, bytes in python3) 193 returns bytes (i.e. str in python2, bytes in python3)
189 """ 194 """
190 - if size is None:  
191 - self.pos = self.size  
192 - else:  
193 - self.pos += size  
194 - return self.handle.read(size)  
195 -  
196 - def seek(self, pos, offset):  
197 - """ re-position point so read() will continue elsewhere  
198 -  
199 - only re-positioning to start of file is allowed  
200 - """  
201 - CHUNK_SIZE = 4096  
202 - if pos == 0 and offset == io.SEEK_SET:  
203 - self.reset() 195 + if self.handle is None:
  196 + raise IOError('read on closed handle')
  197 + if self.pos >= self.size:
  198 + # print('ZipSubFile: read fake at end')
  199 + return b'' # fake being at the end, even if we are not
  200 + data = self.handle.read(size)
  201 + self.pos += len(data)
  202 + # print('ZipSubFile: read {} bytes, pos now {}'.format(size, self.pos))
  203 + return data
  204 +
  205 + def seek(self, pos, offset=io.SEEK_SET):
  206 + """ re-position point so read() will continue elsewhere """
  207 + # calc target position from self.pos, pos and offset
  208 + if offset == io.SEEK_SET:
  209 + new_pos = pos
204 elif offset == io.SEEK_CUR: 210 elif offset == io.SEEK_CUR:
205 - if pos == -self.pos:  
206 - self.reset()  
207 - elif pos == 0:  
208 - return  
209 - elif pos > 0:  
210 - skipped = 0  
211 - n_chunks, leftover = divmod(pos, CHUNK_SIZE)  
212 - for _ in range(n_chunks):  
213 - self.read(CHUNK_SIZE) # just read and discard  
214 - self.read(leftover)  
215 - else:  
216 - raise NotImplementedError('could reset() and read()?') 211 + new_pos = self.pos + pos
  212 + elif offset == io.SEEK_END:
  213 + new_pos = self.size + pos
217 else: 214 else:
218 - raise NotImplementedError('could reset() and read()?') 215 + raise ValueError("invalid offset {0}, need SEEK_* constant"
  216 + .format(offset))
  217 +
  218 + # now get to that position, doing reads and resets as necessary
  219 + if new_pos < 0:
  220 + # print('ZipSubFile: Error: seek to {}'.format(new_pos))
  221 + raise IOError('Seek beyond start of file not allowed')
  222 + elif new_pos == self.pos:
  223 + # print('ZipSubFile: nothing to do')
  224 + pass
  225 + elif new_pos == 0:
  226 + # print('ZipSubFile: seek to start')
  227 + self.reset()
  228 + elif new_pos < self.pos:
  229 + # print('ZipSubFile: seek back')
  230 + self.reset()
  231 + self._seek_skip(new_pos) # --> read --> update self.pos
  232 + elif new_pos < self.size:
  233 + # print('ZipSubFile: seek forward')
  234 + self._seek_skip(new_pos - self.pos) # --> read --> update self.pos
  235 + else: # new_pos >= self.size
  236 + # print('ZipSubFile: seek to end')
  237 + self.pos = new_pos # fake being at the end; remember pos >= size
  238 +
  239 + def _seek_skip(self, to_skip):
  240 + """ helper for seek: skip forward by given amount using read() """
  241 + # print('ZipSubFile: seek by skipping {} bytes starting at {}'
  242 + # .format(to_skip, self.pos))
  243 + CHUNK_SIZE = 4096
  244 + n_chunks, leftover = divmod(to_skip, CHUNK_SIZE)
  245 + for _ in range(n_chunks):
  246 + self.read(CHUNK_SIZE) # just read and discard
  247 + self.read(leftover)
  248 + # print('ZipSubFile: seek by skipping done, pos now {}'
  249 + # .format(self.pos))
219 250
220 def tell(self): 251 def tell(self):
221 """ inform about position of next read """ 252 """ inform about position of next read """
  253 + # print('ZipSubFile: tell-ing we are at {}'.format(self.pos))
222 return self.pos 254 return self.pos
223 255
224 def reset(self): 256 def reset(self):
225 """ close and re-open """ 257 """ close and re-open """
  258 + # print('ZipSubFile: resetting')
226 self.close() 259 self.close()
227 self.open() 260 self.open()
228 261
229 def close(self): 262 def close(self):
230 """ close file """ 263 """ close file """
  264 + # print('ZipSubFile: closing')
231 if self.handle is not None: 265 if self.handle is not None:
232 self.handle.close() 266 self.handle.close()
233 self.pos = None 267 self.pos = None
234 self.handle = None 268 self.handle = None
  269 + self.closed = True
235 270
236 def __enter__(self): 271 def __enter__(self):
237 """ start of context manager; opens the file """ 272 """ start of context manager; opens the file """
  273 + # print('ZipSubFile: entering context')
238 self.open() 274 self.open()
239 return self 275 return self
240 276
241 def __exit__(self, *args, **kwargs): 277 def __exit__(self, *args, **kwargs):
242 """ end of context manager; closes the file """ 278 """ end of context manager; closes the file """
  279 + # print('ZipSubFile: exiting context')
243 self.close() 280 self.close()
244 281
245 def __str__(self): 282 def __str__(self):