Commit 823d07b35bee7ad7705e8a0ce73d8c1ca72ce758

Authored by Christian Herdtweck
1 parent 754ae5d9

make many ppt_parser functions generators; use decorator for try-open-except-close(stream)

oletools/olevba.py
... ... @@ -2203,27 +2203,10 @@ class VBA_Parser(object):
2203 2203 ppt_parser.enable_logging()
2204 2204 try:
2205 2205 ppt = ppt_parser.PptParser(self.ole_file, fast_fail=True)
2206   - info_container = ppt.search_vba_info()
2207   - n_infos = len(info_container)
2208   - n_macros = sum(1 for info in info_container
2209   - if info.vba_info_atom.f_has_macros > 0)
2210   - n_infos = len(ppt.search_vba_info())
2211   - # TODO: does it make sense at all to continue if n_macros == 0?
2212   - # --> no vba-info, so all storages probably ActiveX or other OLE
2213   - storages = ppt.search_vba_storage()
2214   - n_storages = len(storages)
2215   - n_compressed = 0
2216   - for storage in storages:
2217   - if storage.is_compressed:
2218   - storage_decomp = ppt.decompress_vba_storage(storage)
2219   - n_compressed += 1
2220   - else:
2221   - storage_decomp = ppt.read_vba_storage_data(storage)
2222   - self.ole_subfiles.append(VBA_Parser(None, storage_decomp,
  2206 + for vba_data in ppt.iter_vba_data():
  2207 + self.ole_subfiles.append(VBA_Parser(None, vba_data,
2223 2208 container='PptParser'))
2224   - log.info('File is PPT with {} vba infos ({} with macros) and {} '
2225   - 'vba storages ({} compressed)'
2226   - .format(n_infos, n_macros, n_storages, n_compressed))
  2209 + log.info('File is PPT')
2227 2210 self.ole_file.close() # just in case
2228 2211 self.ole_file = None # required to make other methods look at ole_subfiles
2229 2212 self.type = TYPE_PPT
... ...
oletools/ppt_parser.py
... ... @@ -21,12 +21,12 @@ References:
21 21 # - can speed-up by using less bigger struct.parse calls?
22 22 # - license
23 23 # - make buffered stream from output of iterative_decompress
24   -# - less stream open/close, possibly through decorator for open+closing?
  24 +# - maybe the 2 decorators (with_opened_main_stream + generator variant) can be merged into 1?
25 25 #
26 26 # CHANGELOG:
27 27 # 2016-05-04 v0.01 CH: - start parsing "Current User" stream
28 28  
29   -__version__ = '0.01'
  29 +__version__ = '0.02'
30 30  
31 31  
32 32 #--- IMPORTS ------------------------------------------------------------------
... ... @@ -1030,6 +1030,78 @@ class ExternalObjectStorageCompressed(ExternalObjectStorage):
1030 1030  
1031 1031 # === PptParser ===============================================================
1032 1032  
  1033 +def with_opened_main_stream(func):
  1034 + """ a decorator that can open and close the default stream for func
  1035 +
  1036 + to be applied only to functions in PptParser that read from default stream
  1037 + (:py:data:`MAIN_STREAM_NAME`)
  1038 +
  1039 + Decorated functions need to accept args (self, stream, ...)
  1040 + """
  1041 +
  1042 + def wrapped(self, *args, **kwargs):
  1043 + # remember who opened the stream so that function also closes it
  1044 + stream_opened_by_me = False
  1045 + try:
  1046 + # open stream if required
  1047 + if self._open_main_stream is None:
  1048 + log.debug('opening stream {!r} for {}'
  1049 + .format(MAIN_STREAM_NAME, func.__name__))
  1050 + self._open_main_stream = self.ole.openstream(MAIN_STREAM_NAME)
  1051 + stream_opened_by_me = True
  1052 +
  1053 + # run wrapped function
  1054 + return func(self, self._open_main_stream, *args, **kwargs)
  1055 +
  1056 + # error handling
  1057 + except Exception:
  1058 + if self.fast_fail:
  1059 + raise
  1060 + else:
  1061 + self._log_exception()
  1062 + finally:
  1063 + # ensure stream is closed by the one who opened it (even if error)
  1064 + if stream_opened_by_me:
  1065 + log.debug('closing stream {!r} after {}'
  1066 + .format(MAIN_STREAM_NAME, func.__name__))
  1067 + self._open_main_stream.close()
  1068 + self._open_main_stream = None
  1069 + return wrapped
  1070 +
  1071 +
  1072 +def generator_with_opened_main_stream(func):
  1073 + """ same as with_opened_main_stream, but for generator functions (yield instead of return) """
  1074 +
  1075 + def wrapped(self, *args, **kwargs):
  1076 + # remember who opened the stream so that function also closes it
  1077 + stream_opened_by_me = False
  1078 + try:
  1079 + # open stream if required
  1080 + if self._open_main_stream is None:
  1081 + log.debug('opening stream {!r} for {}'
  1082 + .format(MAIN_STREAM_NAME, func.__name__))
  1083 + self._open_main_stream = self.ole.openstream(MAIN_STREAM_NAME)
  1084 + stream_opened_by_me = True
  1085 +
  1086 + # run actual function
  1087 + for result in func(self, self._open_main_stream, *args, **kwargs):
  1088 + yield result
  1089 +
  1090 + # error handling
  1091 + except Exception:
  1092 + if self.fast_fail:
  1093 + raise
  1094 + else:
  1095 + self._log_exception()
  1096 + finally:
  1097 + # ensure stream is closed by the one who opened it (even if error)
  1098 + if stream_opened_by_me:
  1099 + log.debug('closing stream {!r} after {}'
  1100 + .format(MAIN_STREAM_NAME, func.__name__))
  1101 + self._open_main_stream.close()
  1102 + self._open_main_stream = None
  1103 + return wrapped
  1104 +
1033 1105  
1034 1106 class PptParser(object):
1035 1107 """ Parser for PowerPoint 97-2003 specific data structures
... ... @@ -1074,6 +1146,8 @@ class PptParser(object):
1074 1146 if not MAIN_STREAM_NAME.lower() in root_streams:
1075 1147 self._fail('root', 'listdir', root_streams, MAIN_STREAM_NAME)
1076 1148  
  1149 + self._open_main_stream = None
  1150 +
1077 1151 def _log_exception(self, msg=None):
1078 1152 """ log an exception instead of raising it
1079 1153  
... ... @@ -1121,7 +1195,7 @@ class PptParser(object):
1121 1195  
1122 1196 stream = None
1123 1197 try:
1124   - log.debug('opening stream')
  1198 + log.debug('opening stream "Current User"')
1125 1199 stream = self.ole.openstream('Current User')
1126 1200 self.current_user_atom = CurrentUserAtom.extract_from(stream)
1127 1201 except Exception:
... ... @@ -1131,10 +1205,11 @@ class PptParser(object):
1131 1205 self._log_exception()
1132 1206 finally:
1133 1207 if stream is not None:
1134   - log.debug('closing stream')
  1208 + log.debug('closing stream "Current User"')
1135 1209 stream.close()
1136 1210  
1137   - def parse_persist_object_directory(self):
  1211 + @with_opened_main_stream
  1212 + def parse_persist_object_directory(self, stream):
1138 1213 """ Part 1: Construct the persist object directory """
1139 1214  
1140 1215 if self.persist_object_directory is not None:
... ... @@ -1152,100 +1227,87 @@ class PptParser(object):
1152 1227 self.persist_object_directory = {}
1153 1228 self.newest_user_edit = None
1154 1229  
1155   - stream = None
1156   - try:
1157   - log.debug('opening stream')
1158   - stream = self.ole.openstream(MAIN_STREAM_NAME)
1159   -
1160   - # Repeat steps 3 through 6 until offsetLastEdit is 0x00000000.
1161   - while offset != 0:
1162   -
1163   - # Step 2: Seek, in the PowerPoint Document Stream, to the
1164   - # offset specified by the offsetToCurrentEdit field of the
1165   - # CurrentUserAtom record identified in step 1.
1166   - stream.seek(offset, os.SEEK_SET)
1167   -
1168   - # Step 3: Read the UserEditAtom record at the current offset.
1169   - # Let this record be a live record.
1170   - user_edit = UserEditAtom.extract_from(stream, is_encrypted)
1171   - if self.newest_user_edit is None:
1172   - self.newest_user_edit = user_edit
1173   -
1174   - log.debug('checking validity')
1175   - errs = user_edit.check_validity()
1176   - if errs:
1177   - log.warning('check_validity found {} issues'
1178   - .format(len(errs)))
1179   - for err in errs:
1180   - log.warning('UserEditAtom.check_validity: {}'.format(err))
1181   - if errs and self.fast_fail:
1182   - raise errs[0]
1183   -
1184   - # Step 4: Seek to the offset specified by the
1185   - # offsetPersistDirectory field of the UserEditAtom record
1186   - # identified in step 3.
1187   - log.debug('seeking to pos {}'
1188   - .format(user_edit.offset_persist_directory))
1189   - stream.seek(user_edit.offset_persist_directory, os.SEEK_SET)
1190   -
1191   - # Step 5: Read the PersistDirectoryAtom record at the current
1192   - # offset. Let this record be a live record.
1193   - persist_dir_atom = PersistDirectoryAtom.extract_from(stream)
  1230 + # Repeat steps 3 through 6 until offsetLastEdit is 0x00000000.
  1231 + while offset != 0:
1194 1232  
1195   - log.debug('checking validity')
1196   - errs = persist_dir_atom.check_validity(offset)
1197   - if errs:
1198   - log.warning('check_validity found {} issues'
1199   - .format(len(errs)))
1200   - for err in errs:
1201   - log.warning('PersistDirectoryAtom.check_validity: {}'
1202   - .format(err))
1203   - if errs and self.fast_fail:
1204   - raise errs[0]
1205   -
1206   -
1207   - # Construct the complete persist object directory for this file
1208   - # as follows:
1209   - # - For each PersistDirectoryAtom record previously identified
1210   - # in step 5, add the persist object identifier and persist
1211   - # object stream offset pairs to the persist object directory
1212   - # starting with the PersistDirectoryAtom record last
1213   - # identified, that is, the one closest to the beginning of the
1214   - # stream.
1215   - # - Continue adding these pairs to the persist object directory
1216   - # for each PersistDirectoryAtom record in the reverse order
1217   - # that they were identified in step 5; that is, the pairs from
1218   - # the PersistDirectoryAtom record closest to the end of the
1219   - # stream are added last.
1220   - # - When adding a new pair to the persist object directory, if
1221   - # the persist object identifier already exists in the persist
1222   - # object directory, the persist object stream offset from the
1223   - # new pair replaces the existing persist object stream offset
1224   - # for that persist object identifier.
1225   - for entry in persist_dir_atom.rg_persist_dir_entry:
1226   - last_id = entry.persist_id+len(entry.rg_persist_offset)-1
1227   - log.debug('for persist IDs {}-{}, save offsets {}'
1228   - .format(entry.persist_id, last_id,
1229   - entry.rg_persist_offset))
1230   - for count, offset in enumerate(entry.rg_persist_offset):
1231   - self.persist_object_directory[entry.persist_id+count] \
1232   - = offset
1233   -
1234   - # check for more
1235   - # Step 6: Seek to the offset specified by the offsetLastEdit
1236   - # field in the UserEditAtom record identified in step 3.
1237   - offset = user_edit.offset_last_edit
1238   - except Exception:
1239   - if self.fast_fail:
1240   - raise
1241   - else:
1242   - self._log_exception()
1243   - finally:
1244   - if stream is not None:
1245   - log.debug('closing stream')
1246   - stream.close()
  1233 + # Step 2: Seek, in the PowerPoint Document Stream, to the
  1234 + # offset specified by the offsetToCurrentEdit field of the
  1235 + # CurrentUserAtom record identified in step 1.
  1236 + stream.seek(offset, os.SEEK_SET)
1247 1237  
1248   - def parse_document_persist_object(self):
  1238 + # Step 3: Read the UserEditAtom record at the current offset.
  1239 + # Let this record be a live record.
  1240 + user_edit = UserEditAtom.extract_from(stream, is_encrypted)
  1241 + if self.newest_user_edit is None:
  1242 + self.newest_user_edit = user_edit
  1243 +
  1244 + log.debug('checking validity')
  1245 + errs = user_edit.check_validity()
  1246 + if errs:
  1247 + log.warning('check_validity found {} issues'
  1248 + .format(len(errs)))
  1249 + for err in errs:
  1250 + log.warning('UserEditAtom.check_validity: {}'.format(err))
  1251 + if errs and self.fast_fail:
  1252 + raise errs[0]
  1253 +
  1254 + # Step 4: Seek to the offset specified by the
  1255 + # offsetPersistDirectory field of the UserEditAtom record
  1256 + # identified in step 3.
  1257 + log.debug('seeking to pos {}'
  1258 + .format(user_edit.offset_persist_directory))
  1259 + stream.seek(user_edit.offset_persist_directory, os.SEEK_SET)
  1260 +
  1261 + # Step 5: Read the PersistDirectoryAtom record at the current
  1262 + # offset. Let this record be a live record.
  1263 + persist_dir_atom = PersistDirectoryAtom.extract_from(stream)
  1264 +
  1265 + log.debug('checking validity')
  1266 + errs = persist_dir_atom.check_validity(offset)
  1267 + if errs:
  1268 + log.warning('check_validity found {} issues'
  1269 + .format(len(errs)))
  1270 + for err in errs:
  1271 + log.warning('PersistDirectoryAtom.check_validity: {}'
  1272 + .format(err))
  1273 + if errs and self.fast_fail:
  1274 + raise errs[0]
  1275 +
  1276 +
  1277 + # Construct the complete persist object directory for this file
  1278 + # as follows:
  1279 + # - For each PersistDirectoryAtom record previously identified
  1280 + # in step 5, add the persist object identifier and persist
  1281 + # object stream offset pairs to the persist object directory
  1282 + # starting with the PersistDirectoryAtom record last
  1283 + # identified, that is, the one closest to the beginning of the
  1284 + # stream.
  1285 + # - Continue adding these pairs to the persist object directory
  1286 + # for each PersistDirectoryAtom record in the reverse order
  1287 + # that they were identified in step 5; that is, the pairs from
  1288 + # the PersistDirectoryAtom record closest to the end of the
  1289 + # stream are added last.
  1290 + # - When adding a new pair to the persist object directory, if
  1291 + # the persist object identifier already exists in the persist
  1292 + # object directory, the persist object stream offset from the
  1293 + # new pair replaces the existing persist object stream offset
  1294 + # for that persist object identifier.
  1295 + for entry in persist_dir_atom.rg_persist_dir_entry:
  1296 + last_id = entry.persist_id+len(entry.rg_persist_offset)-1
  1297 + log.debug('for persist IDs {}-{}, save offsets {}'
  1298 + .format(entry.persist_id, last_id,
  1299 + entry.rg_persist_offset))
  1300 + for count, offset in enumerate(entry.rg_persist_offset):
  1301 + self.persist_object_directory[entry.persist_id+count] \
  1302 + = offset
  1303 +
  1304 + # check for more
  1305 + # Step 6: Seek to the offset specified by the offsetLastEdit
  1306 + # field in the UserEditAtom record identified in step 3.
  1307 + offset = user_edit.offset_last_edit
  1308 +
  1309 + @with_opened_main_stream
  1310 + def parse_document_persist_object(self, stream):
1249 1311 """ Part 2: Identify the document persist object """
1250 1312 if self.document_persist_obj is not None:
1251 1313 log.warning('re-reading and overwriting '
... ... @@ -1265,27 +1327,13 @@ class PptParser(object):
1265 1327 log.debug('newest user edit ID is {}, offset is {}'
1266 1328 .format(newest_ref, offset))
1267 1329  
1268   - stream = None
  1330 + # Step 3: Seek to the stream offset specified in step 2.
  1331 + log.debug('seek to {}'.format(offset))
  1332 + stream.seek(offset, os.SEEK_SET)
1269 1333  
1270   - try:
1271   - # Step 3: Seek to the stream offset specified in step 2.
1272   - log.debug('opening stream')
1273   - stream = self.ole.openstream(MAIN_STREAM_NAME)
1274   - log.debug('seek to {}'.format(offset))
1275   - stream.seek(offset, os.SEEK_SET)
1276   -
1277   - # Step 4: Read the DocumentContainer record at the current offset.
1278   - # Let this record be a live record.
1279   - self.document_persist_obj = DocumentContainer.extract_from(stream)
1280   - except Exception:
1281   - if self.fast_fail:
1282   - raise
1283   - else:
1284   - self._log_exception()
1285   - finally:
1286   - if stream is not None:
1287   - log.debug('closing stream')
1288   - stream.close()
  1334 + # Step 4: Read the DocumentContainer record at the current offset.
  1335 + # Let this record be a live record.
  1336 + self.document_persist_obj = DocumentContainer.extract_from(stream)
1289 1337  
1290 1338 log.debug('checking validity')
1291 1339 errs = self.document_persist_obj.check_validity()
... ... @@ -1297,7 +1345,13 @@ class PptParser(object):
1297 1345 if errs and self.fast_fail:
1298 1346 raise errs[0]
1299 1347  
1300   - def search_pattern(self, pattern, stream):
  1348 + #--------------------------------------------------------------------------
  1349 + # 2nd attempt: do not parse whole structure but search through stream and
  1350 + # yield results as they become available
  1351 + # Keep in mind that after every yield the stream position may be anything!
  1352 +
  1353 + @generator_with_opened_main_stream
  1354 + def search_pattern(self, stream, pattern):
1301 1355 """ search for pattern in stream, return indices """
1302 1356  
1303 1357 BUF_SIZE = 1024
... ... @@ -1308,30 +1362,28 @@ class PptParser(object):
1308 1362 raise ValueError('need buf > pattern to search!')
1309 1363  
1310 1364 n_reads = 0
1311   - candidates = []
1312 1365 while True:
1313 1366 start_pos = stream.tell()
1314 1367 n_reads += 1
1315   - #log.debug('read {} starting from {}'
1316   - # .format(BUF_SIZE, start_pos))
  1368 + log.debug('read {} starting from {}'
  1369 + .format(BUF_SIZE, start_pos))
1317 1370 buf = stream.read(BUF_SIZE)
1318 1371 idx = buf.find(pattern)
1319 1372 while idx != -1:
1320 1373 log.debug('found pattern at index {}'.format(start_pos+idx))
1321   - candidates.append(start_pos+idx)
  1374 + yield start_pos + idx
1322 1375 idx = buf.find(pattern, idx+1)
1323 1376  
1324 1377 if len(buf) == BUF_SIZE:
1325 1378 # move back a bit to avoid splitting of pattern through buf
1326   - stream.seek(-1 * pattern_len, os.SEEK_CUR)
  1379 + stream.seek(start_pos + BUF_SIZE - pattern_len, os.SEEK_SET)
1327 1380 else:
1328 1381 log.debug('reached end of buf (read {}<{}) after {} reads'
1329 1382 .format(len(buf), BUF_SIZE, n_reads))
1330 1383 break
1331   - return candidates
1332 1384  
1333   -
1334   - def search_vba_info(self):
  1385 + @generator_with_opened_main_stream
  1386 + def search_vba_info(self, stream):
1335 1387 """ search through stream for VBAInfoContainer, alternative to parse...
1336 1388  
1337 1389 quick-and-dirty: do not parse everything, just look for right bytes
... ... @@ -1348,51 +1400,37 @@ class PptParser(object):
1348 1400 rec_len=VBAInfoContainer.RECORD_LENGTH) \
1349 1401 + VBAInfoAtom.generate_pattern(
1350 1402 rec_len=VBAInfoAtom.RECORD_LENGTH)
1351   - stream = None
1352   - try:
1353   - log.debug('opening stream')
1354   - stream = self.ole.openstream(MAIN_STREAM_NAME)
1355   -
1356   - # look for candidate positions
1357   - candidates = self.search_pattern(pattern, stream)
1358   -
1359   - # try parse
1360   - containers = []
1361   - for idx in candidates:
1362   - # assume that in stream at idx there is a VBAInfoContainer
1363   - stream.seek(idx)
1364   - log.debug('extracting at idx {}'.format(idx))
1365   - try:
1366   - container = VBAInfoContainer.extract_from(stream)
1367   - except Exception:
1368   - self._log_exception()
1369   - continue
1370   -
1371   - errs = container.check_validity()
1372   - if errs:
1373   - log.warning('check_validity found {} issues'
1374   - .format(len(errs)))
1375   - else:
1376   - log.debug('container is ok')
1377   - atom = container.vba_info_atom
1378   - log.debug('persist id ref is {}, has_macros {}, version {}'
1379   - .format(atom.persist_id_ref, atom.f_has_macros,
1380   - atom.version))
1381   - containers.append(container)
1382   - for err in errs:
1383   - log.warning('check_validity(VBAInfoContainer): {}'
1384   - .format(err))
1385   - if errs and self.fast_fail:
1386   - raise errs[0]
1387   -
1388   - return containers
1389 1403  
1390   - finally:
1391   - if stream is not None:
1392   - log.debug('closing stream')
1393   - stream.close()
  1404 + # try parse
  1405 + for idx in self.search_pattern(pattern):
  1406 + # assume that in stream at idx there is a VBAInfoContainer
  1407 + stream.seek(idx)
  1408 + log.debug('extracting at idx {}'.format(idx))
  1409 + try:
  1410 + container = VBAInfoContainer.extract_from(stream)
  1411 + except Exception:
  1412 + self._log_exception()
  1413 + continue
1394 1414  
1395   - def search_vba_storage(self):
  1415 + errs = container.check_validity()
  1416 + if errs:
  1417 + log.warning('check_validity found {} issues'
  1418 + .format(len(errs)))
  1419 + else:
  1420 + log.debug('container is ok')
  1421 + atom = container.vba_info_atom
  1422 + log.debug('persist id ref is {}, has_macros {}, version {}'
  1423 + .format(atom.persist_id_ref, atom.f_has_macros,
  1424 + atom.version))
  1425 + yield container
  1426 + for err in errs:
  1427 + log.warning('check_validity(VBAInfoContainer): {}'
  1428 + .format(err))
  1429 + if errs and self.fast_fail:
  1430 + raise errs[0]
  1431 +
  1432 + @generator_with_opened_main_stream
  1433 + def search_vba_storage(self, stream):
1396 1434 """ search through stream for VBAProjectStg, alternative to parse...
1397 1435  
1398 1436 quick-and-dirty: do not parse everything, just look for right bytes
... ... @@ -1403,120 +1441,113 @@ class PptParser(object):
1403 1441 The storages found could also contain (instead of VBA data): ActiveX
1404 1442 data or general OLE data
1405 1443  
  1444 + yields results as it finds them
  1445 +
1406 1446 .. seealso:: :py:meth:`search_vba_info`
1407 1447 """
1408 1448  
1409 1449 logging.debug('looking for VBA storage objects')
1410   - stream = None
1411   - try:
1412   - log.debug('opening stream')
1413   - stream = self.ole.openstream(MAIN_STREAM_NAME)
1414   -
1415   - storages = []
1416   - for obj_type in (ExternalObjectStorageUncompressed,
1417   - ExternalObjectStorageCompressed):
1418   - # re-position stream at start
1419   - stream.seek(0, os.SEEK_SET)
1420   -
1421   - # look for candidate positions
1422   - pattern = obj_type.generate_pattern()
1423   - candidates = self.search_pattern(pattern, stream)
1424   -
1425   - # try parse
1426   - for idx in candidates:
1427   - # assume a ExternalObjectStorage in stream at idx
1428   - stream.seek(idx)
1429   - log.debug('extracting at idx {}'.format(idx))
1430   - try:
1431   - storage = obj_type.extract_from(stream)
1432   - except Exception:
1433   - self._log_exception()
1434   - continue
1435   -
1436   - errs = storage.check_validity()
1437   - if errs:
1438   - log.warning('check_validity found {} issues'
1439   - .format(len(errs)))
1440   - else:
1441   - log.debug('storage is ok; compressed={}, size={}, '
1442   - 'size_decomp={}'
1443   - .format(storage.is_compressed,
1444   - storage.rec_head.rec_len,
1445   - storage.uncompressed_size))
1446   - storages.append(storage)
1447   - for err in errs:
1448   - log.warning('check_validity({}): {}'
1449   - .format(obj_type.__name__, err))
1450   - if errs and self.fast_fail:
1451   - raise errs[0]
1452   -
1453   - return storages
  1450 + for obj_type in (ExternalObjectStorageUncompressed,
  1451 + ExternalObjectStorageCompressed):
  1452 + # re-position stream at start
  1453 + stream.seek(0, os.SEEK_SET)
1454 1454  
1455   - finally:
1456   - if stream is not None:
1457   - log.debug('closing stream')
1458   - stream.close()
  1455 + pattern = obj_type.generate_pattern()
1459 1456  
  1457 + # try parse
  1458 + for idx in self.search_pattern(pattern):
  1459 + # assume an ExternalObjectStorage in stream at idx
  1460 + stream.seek(idx)
  1461 + log.debug('extracting at idx {}'.format(idx))
  1462 + try:
  1463 + storage = obj_type.extract_from(stream)
  1464 + except Exception:
  1465 + self._log_exception()
  1466 + continue
  1467 +
  1468 + errs = storage.check_validity()
  1469 + if errs:
  1470 + log.warning('check_validity found {} issues'
  1471 + .format(len(errs)))
  1472 + else:
  1473 + log.debug('storage is ok; compressed={}, size={}, '
  1474 + 'size_decomp={}'
  1475 + .format(storage.is_compressed,
  1476 + storage.rec_head.rec_len,
  1477 + storage.uncompressed_size))
  1478 + yield storage
  1479 + for err in errs:
  1480 + log.warning('check_validity({}): {}'
  1481 + .format(obj_type.__name__, err))
  1482 + if errs and self.fast_fail:
  1483 + raise errs[0]
1460 1484  
1461   - def decompress_vba_storage(self, storage):
  1485 + @with_opened_main_stream
  1486 + def decompress_vba_storage(self, stream, storage):
1462 1487 """ return decompressed data from search_vba_storage """
1463 1488  
1464 1489 log.debug('decompressing storage for VBA OLE data stream ')
1465   - stream = None
1466   - try:
1467   - log.debug('opening stream')
1468   - stream = self.ole.openstream(MAIN_STREAM_NAME)
1469   -
1470   - # decompress iteratively; a zlib.decompress of all data
1471   - # failed with Error -5 (incomplete or truncated stream)
1472   - stream.seek(storage.data_offset, os.SEEK_SET)
1473   - decomp, n_read, err = \
1474   - iterative_decompress(stream, storage.data_size)
1475   - log.debug('decompressed {} to {} bytes; found err: {}'
1476   - .format(n_read, len(decomp), err))
1477   - if err and self.fast_fail:
1478   - raise err
1479   - # otherwise try to continue with partial data
1480   -
1481   - return decomp
1482   -
1483   - ## create OleFileIO from decompressed data
1484   - #ole = olefile.OleFileIO(decomp)
1485   - #root_streams = [entry[0].lower() for entry in ole.listdir()]
1486   - #for required in 'project', 'projectwm', 'vba':
1487   - # if required not in root_streams:
1488   - # raise ValueError('storage seems to not be a VBA storage '
1489   - # '({} not found in root streams)'
1490   - # .format(required))
1491   - #log.debug('tests succeeded')
1492   - #return ole
1493   -
1494   - finally:
1495   - if stream is not None:
1496   - log.debug('closing stream')
1497   - stream.close()
1498 1490  
1499   -
1500   - def read_vba_storage_data(self, storage):
  1491 + # decompress iteratively; a zlib.decompress of all data
  1492 + # failed with Error -5 (incomplete or truncated stream)
  1493 + stream.seek(storage.data_offset, os.SEEK_SET)
  1494 + decomp, n_read, err = \
  1495 + iterative_decompress(stream, storage.data_size)
  1496 + log.debug('decompressed {} to {} bytes; found err: {}'
  1497 + .format(n_read, len(decomp), err))
  1498 + if err and self.fast_fail:
  1499 + raise err
  1500 + # otherwise try to continue with partial data
  1501 +
  1502 + return decomp
  1503 +
  1504 + ## create OleFileIO from decompressed data
  1505 + #ole = olefile.OleFileIO(decomp)
  1506 + #root_streams = [entry[0].lower() for entry in ole.listdir()]
  1507 + #for required in 'project', 'projectwm', 'vba':
  1508 + # if required not in root_streams:
  1509 + # raise ValueError('storage seems to not be a VBA storage '
  1510 + # '({} not found in root streams)'
  1511 + # .format(required))
  1512 + #log.debug('tests succeeded')
  1513 + #return ole
  1514 +
  1515 + @with_opened_main_stream
  1516 + def read_vba_storage_data(self, stream, storage):
1501 1517 """ return data pointed to by uncompressed storage """
1502 1518  
1503   - log.debug('reading uncompressed VBA OLE data stream')
1504   - stream = None
1505   - try:
1506   - log.debug('opening stream')
1507   - stream = self.ole.openstream(MAIN_STREAM_NAME)
1508   -
1509   - log.debug('reading {} bytes starting at {}'
1510   - .format(storage.data_size, storage.data_offset))
1511   - stream.seek(storage.data_offset, os.SEEK_SET)
1512   - data = stream.read(storage.data_size)
1513   -
1514   - return data
  1519 + log.debug('reading uncompressed VBA OLE data stream: '
  1520 + '{} bytes starting at {}'
  1521 + .format(storage.data_size, storage.data_offset))
  1522 + stream.seek(storage.data_offset, os.SEEK_SET)
  1523 + data = stream.read(storage.data_size)
  1524 + return data
  1525 +
  1526 + @generator_with_opened_main_stream
  1527 + def iter_vba_data(self, stream):
  1528 + """ search vba infos and storages, yield storage data (decompressed if needed) """
  1529 +
  1530 + n_infos = 0
  1531 + n_macros = 0
  1532 + for info in self.search_vba_info():
  1533 + n_infos += 1
  1534 + if info.vba_info_atom.f_has_macros > 0:
  1535 + n_macros += 1
  1536 + # TODO: does it make sense at all to continue if n_macros == 0?
  1537 + # --> no vba-info, so all storages probably ActiveX or other OLE
  1538 + n_storages = 0
  1539 + n_compressed = 0
  1540 + for storage in self.search_vba_storage():
  1541 + n_storages += 1
  1542 + if storage.is_compressed:
  1543 + n_compressed += 1
  1544 + yield self.decompress_vba_storage(storage)
  1545 + else:
  1546 + yield self.read_vba_storage_data(storage)
1515 1547  
1516   - finally:
1517   - if stream is not None:
1518   - log.debug('closing stream')
1519   - stream.close()
  1548 + log.info('found {} infos ({} with macros) and {} storages '
  1549 + '({} compressed)'
  1550 + .format(n_infos, n_macros, n_storages, n_compressed))
1520 1551  
1521 1552  
1522 1553 def iterative_decompress(stream, size, chunk_size=4096):
... ... @@ -1559,16 +1590,9 @@ def test():
1559 1590 try:
1560 1591 ppt = PptParser(file_name, fast_fail=False)
1561 1592 #ppt.parse_document_persist_object()
1562   - n_infos = len(ppt.search_vba_info())
1563   - storages = ppt.search_vba_storage()
1564   - n_storages = len(storages)
1565   - log.debug('found {} infos and {} storages'.format(n_infos,
1566   - n_storages))
1567   - if n_infos != n_storages:
1568   - log.warning('found different number of vba infos and storages')
1569   - for storage in storages:
1570   - parser = VBA_Parser(None, ppt.decompress_vba_storage(storage),
1571   - container='PptParser')
  1593 +
  1594 + for vba_data in ppt.iter_vba_data():
  1595 + parser = VBA_Parser(None, vba_data, container='PptParser')
1572 1596 for vba_root, project_path, dir_path in \
1573 1597 parser.find_vba_projects():
1574 1598 log.info('found vba project: root={}, proj={}, dir={}'
... ...