Commit d4eb585e075304a7fd77fb7b8803e25a31908e70
1 parent
a7d1050e
ooxml: re-implement complete seek(); add attr closed
OleFileIO requires a complete seek() and checks for a `closed` attribute. Also added some commented-out debug print statements to ZipSubFile.
Showing 1 changed file with 69 additions and 32 deletions
oletools/ooxml.py
| ... | ... | @@ -130,13 +130,13 @@ class ZipSubFile(object): |
| 130 | 130 | |
| 131 | 131 | ZipFile.open() gives file handles that can be read but not seek()ed since |
| 132 | 132 | the file is being decompressed in the background. This class implements a |
| 133 | - reset() function which corresponds to a seek to 0 (which just closes the | |
| 134 | - stream and re-opens it behind the scenes.) | |
| 135 | - --> can be used e.g. for olefile.isOleFile() | |
| 133 | + reset() function (close and re-open stream) and a seek() that uses it. | |
| 134 | + --> can be used as argument to olefile.OleFileIO and olefile.isOleFile() | |
| 136 | 135 | |
| 137 | 136 | Can be used as a context manager:: |
| 138 | 137 | |
| 139 | 138 | with zipfile.ZipFile('file.zip') as zipper: |
| 139 | + # drop-in replacement for: with zipper.open('subfile') as handle: | |
| 140 | 140 | with ZipSubFile(zipper, 'subfile') as handle: |
| 141 | 141 | print('subfile in file.zip has size {0}, starts with {1}' |
| 142 | 142 | .format(handle.size, handle.read(20))) |
| ... | ... | @@ -147,10 +147,12 @@ class ZipSubFile(object): |
| 147 | 147 | name: name of file within zip file |
| 148 | 148 | mode: open-mode, 'r' per default |
| 149 | 149 | size: size of the stream (constructor arg or taken from ZipFile.getinfo) |
| 150 | + closed: False if there was an open() but no close() since then, True otherwise | |
| 150 | 151 | |
| 151 | 152 | Attributes that are not None only after open() and before close(): |
| 152 | - handle: direkt handle to subfile stream, created by ZipFile.open() | |
| 153 | - pos: current position within stream | |
| 153 | + handle: direct handle to subfile stream, created by ZipFile.open() | |
| 154 | + pos: current position within stream (can deviate from actual position in | |
| 155 | + self.handle if we fake jump to end) | |
| 154 | 156 | """ |
| 155 | 157 | |
| 156 | 158 | def __init__(self, container, filename, mode='r', size=None): |
| ... | ... | @@ -168,6 +170,7 @@ class ZipSubFile(object): |
| 168 | 170 | self.mode = mode |
| 169 | 171 | self.handle = None |
| 170 | 172 | self.pos = None |
| 173 | + self.closed = True | |
| 171 | 174 | |
| 172 | 175 | def open(self): |
| 173 | 176 | """ open subfile for reading; open mode given to constructor before """ |
| ... | ... | @@ -175,6 +178,8 @@ class ZipSubFile(object): |
| 175 | 178 | raise IOError('re-opening file not supported!') |
| 176 | 179 | self.handle = self.container.open(self.name, self.mode) |
| 177 | 180 | self.pos = 0 |
| 181 | + self.closed = False | |
| 182 | + # print('ZipSubFile: opened; size={}'.format(self.size)) | |
| 178 | 183 | return self |
| 179 | 184 | |
| 180 | 185 | def write(self, *args, **kwargs): # pylint: disable=unused-argument,no-self-use |
| ... | ... | @@ -187,59 +192,91 @@ class ZipSubFile(object): |
| 187 | 192 | |
| 188 | 193 | returns bytes (i.e. str in python2, bytes in python3) |
| 189 | 194 | """ |
| 190 | - if size is None: | |
| 191 | - self.pos = self.size | |
| 192 | - else: | |
| 193 | - self.pos += size | |
| 194 | - return self.handle.read(size) | |
| 195 | - | |
| 196 | - def seek(self, pos, offset): | |
| 197 | - """ re-position point so read() will continue elsewhere | |
| 198 | - | |
| 199 | - only re-positioning to start of file is allowed | |
| 200 | - """ | |
| 201 | - CHUNK_SIZE = 4096 | |
| 202 | - if pos == 0 and offset == io.SEEK_SET: | |
| 203 | - self.reset() | |
| 195 | + if self.handle is None: | |
| 196 | + raise IOError('read on closed handle') | |
| 197 | + if self.pos >= self.size: | |
| 198 | + # print('ZipSubFile: read fake at end') | |
| 199 | + return b'' # fake being at the end, even if we are not | |
| 200 | + data = self.handle.read(size) | |
| 201 | + self.pos += len(data) | |
| 202 | + # print('ZipSubFile: read {} bytes, pos now {}'.format(size, self.pos)) | |
| 203 | + return data | |
| 204 | + | |
| 205 | + def seek(self, pos, offset=io.SEEK_SET): | |
| 206 | + """ re-position point so read() will continue elsewhere """ | |
| 207 | + # calc target position from self.pos, pos and offset | |
| 208 | + if offset == io.SEEK_SET: | |
| 209 | + new_pos = pos | |
| 204 | 210 | elif offset == io.SEEK_CUR: |
| 205 | - if pos == -self.pos: | |
| 206 | - self.reset() | |
| 207 | - elif pos == 0: | |
| 208 | - return | |
| 209 | - elif pos > 0: | |
| 210 | - skipped = 0 | |
| 211 | - n_chunks, leftover = divmod(pos, CHUNK_SIZE) | |
| 212 | - for _ in range(n_chunks): | |
| 213 | - self.read(CHUNK_SIZE) # just read and discard | |
| 214 | - self.read(leftover) | |
| 215 | - else: | |
| 216 | - raise NotImplementedError('could reset() and read()?') | |
| 211 | + new_pos = self.pos + pos | |
| 212 | + elif offset == io.SEEK_END: | |
| 213 | + new_pos = self.size + pos | |
| 217 | 214 | else: |
| 218 | - raise NotImplementedError('could reset() and read()?') | |
| 215 | + raise ValueError("invalid offset {0}, need SEEK_* constant" | |
| 216 | + .format(offset)) | |
| 217 | + | |
| 218 | + # now get to that position, doing reads and resets as necessary | |
| 219 | + if new_pos < 0: | |
| 220 | + # print('ZipSubFile: Error: seek to {}'.format(new_pos)) | |
| 221 | + raise IOError('Seek beyond start of file not allowed') | |
| 222 | + elif new_pos == self.pos: | |
| 223 | + # print('ZipSubFile: nothing to do') | |
| 224 | + pass | |
| 225 | + elif new_pos == 0: | |
| 226 | + # print('ZipSubFile: seek to start') | |
| 227 | + self.reset() | |
| 228 | + elif new_pos < self.pos: | |
| 229 | + # print('ZipSubFile: seek back') | |
| 230 | + self.reset() | |
| 231 | + self._seek_skip(new_pos) # --> read --> update self.pos | |
| 232 | + elif new_pos < self.size: | |
| 233 | + # print('ZipSubFile: seek forward') | |
| 234 | + self._seek_skip(new_pos - self.pos) # --> read --> update self.pos | |
| 235 | + else: # new_pos >= self.size | |
| 236 | + # print('ZipSubFile: seek to end') | |
| 237 | + self.pos = new_pos # fake being at the end; remember pos >= size | |
| 238 | + | |
| 239 | + def _seek_skip(self, to_skip): | |
| 240 | + """ helper for seek: skip forward by given amount using read() """ | |
| 241 | + # print('ZipSubFile: seek by skipping {} bytes starting at {}' | |
| 242 | + # .format(to_skip, self.pos)) | |
| 243 | + CHUNK_SIZE = 4096 | |
| 244 | + n_chunks, leftover = divmod(to_skip, CHUNK_SIZE) | |
| 245 | + for _ in range(n_chunks): | |
| 246 | + self.read(CHUNK_SIZE) # just read and discard | |
| 247 | + self.read(leftover) | |
| 248 | + # print('ZipSubFile: seek by skipping done, pos now {}' | |
| 249 | + # .format(self.pos)) | |
| 219 | 250 | |
| 220 | 251 | def tell(self): |
| 221 | 252 | """ inform about position of next read """ |
| 253 | + # print('ZipSubFile: tell-ing we are at {}'.format(self.pos)) | |
| 222 | 254 | return self.pos |
| 223 | 255 | |
| 224 | 256 | def reset(self): |
| 225 | 257 | """ close and re-open """ |
| 258 | + # print('ZipSubFile: resetting') | |
| 226 | 259 | self.close() |
| 227 | 260 | self.open() |
| 228 | 261 | |
| 229 | 262 | def close(self): |
| 230 | 263 | """ close file """ |
| 264 | + # print('ZipSubFile: closing') | |
| 231 | 265 | if self.handle is not None: |
| 232 | 266 | self.handle.close() |
| 233 | 267 | self.pos = None |
| 234 | 268 | self.handle = None |
| 269 | + self.closed = True | |
| 235 | 270 | |
| 236 | 271 | def __enter__(self): |
| 237 | 272 | """ start of context manager; opens the file """ |
| 273 | + # print('ZipSubFile: entering context') | |
| 238 | 274 | self.open() |
| 239 | 275 | return self |
| 240 | 276 | |
| 241 | 277 | def __exit__(self, *args, **kwargs): |
| 242 | 278 | """ end of context manager; closes the file """ |
| 279 | + # print('ZipSubFile: exiting context') | |
| 243 | 280 | self.close() |
| 244 | 281 | |
| 245 | 282 | def __str__(self): | ... | ... |