Commit 823d07b35bee7ad7705e8a0ce73d8c1ca72ce758

Authored by Christian Herdtweck
1 parent 754ae5d9

make many ppt_parser functions generators; use decorator for try-open-except-close(stream)

oletools/olevba.py
@@ -2203,27 +2203,10 @@ class VBA_Parser(object): @@ -2203,27 +2203,10 @@ class VBA_Parser(object):
2203 ppt_parser.enable_logging() 2203 ppt_parser.enable_logging()
2204 try: 2204 try:
2205 ppt = ppt_parser.PptParser(self.ole_file, fast_fail=True) 2205 ppt = ppt_parser.PptParser(self.ole_file, fast_fail=True)
2206 - info_container = ppt.search_vba_info()  
2207 - n_infos = len(info_container)  
2208 - n_macros = sum(1 for info in info_container  
2209 - if info.vba_info_atom.f_has_macros > 0)  
2210 - n_infos = len(ppt.search_vba_info())  
2211 - # TODO: does it make sense at all to continue if n_macros == 0?  
2212 - # --> no vba-info, so all storages probably ActiveX or other OLE  
2213 - storages = ppt.search_vba_storage()  
2214 - n_storages = len(storages)  
2215 - n_compressed = 0  
2216 - for storage in storages:  
2217 - if storage.is_compressed:  
2218 - storage_decomp = ppt.decompress_vba_storage(storage)  
2219 - n_compressed += 1  
2220 - else:  
2221 - storage_decomp = ppt.read_vba_storage_data(storage)  
2222 - self.ole_subfiles.append(VBA_Parser(None, storage_decomp, 2206 + for vba_data in ppt.iter_vba_data():
  2207 + self.ole_subfiles.append(VBA_Parser(None, vba_data,
2223 container='PptParser')) 2208 container='PptParser'))
2224 - log.info('File is PPT with {} vba infos ({} with macros) and {} '  
2225 - 'vba storages ({} compressed)'  
2226 - .format(n_infos, n_macros, n_storages, n_compressed)) 2209 + log.info('File is PPT')
2227 self.ole_file.close() # just in case 2210 self.ole_file.close() # just in case
2228 self.ole_file = None # required to make other methods look at ole_subfiles 2211 self.ole_file = None # required to make other methods look at ole_subfiles
2229 self.type = TYPE_PPT 2212 self.type = TYPE_PPT
oletools/ppt_parser.py
@@ -21,12 +21,12 @@ References: @@ -21,12 +21,12 @@ References:
21 # - can speed-up by using less bigger struct.parse calls? 21 # - can speed-up by using less bigger struct.parse calls?
22 # - license 22 # - license
23 # - make buffered stream from output of iterative_decompress 23 # - make buffered stream from output of iterative_decompress
24 -# - less stream open/close, possibly through decorator for open+closing? 24 +# - maybe can merge the 2 decorators into 1? (with_opened_main_stream)
25 # 25 #
26 # CHANGELOG: 26 # CHANGELOG:
27 # 2016-05-04 v0.01 CH: - start parsing "Current User" stream 27 # 2016-05-04 v0.01 CH: - start parsing "Current User" stream
28 28
29 -__version__ = '0.01' 29 +__version__ = '0.02'
30 30
31 31
32 #--- IMPORTS ------------------------------------------------------------------ 32 #--- IMPORTS ------------------------------------------------------------------
@@ -1030,6 +1030,78 @@ class ExternalObjectStorageCompressed(ExternalObjectStorage): @@ -1030,6 +1030,78 @@ class ExternalObjectStorageCompressed(ExternalObjectStorage):
1030 1030
1031 # === PptParser =============================================================== 1031 # === PptParser ===============================================================
1032 1032
def with_opened_main_stream(func):
    """ a decorator that can open and close the default stream for func

    to be applied only to functions in PptParser that read from default stream
    (:py:data:`MAIN_STREAM_NAME`)

    Decorated functions need to accept args (self, stream, ...); callers omit
    the stream arg, it is injected by this decorator.

    If the stream is already open (because an outer decorated function opened
    it), it is re-used and NOT closed here -- only the opener closes it.
    """
    from functools import wraps

    @wraps(func)   # preserve func's __name__/docstring on the wrapper
    def wrapped(self, *args, **kwargs):
        # remember who opened the stream so that function also closes it
        stream_opened_by_me = False
        try:
            # open stream if required
            if self._open_main_stream is None:
                log.debug('opening stream {!r} for {}'
                          .format(MAIN_STREAM_NAME, func.__name__))
                self._open_main_stream = self.ole.openstream(MAIN_STREAM_NAME)
                stream_opened_by_me = True

            # run wrapped function
            return func(self, self._open_main_stream, *args, **kwargs)

        # error handling: honour fast_fail, otherwise just log
        # (on a swallowed error the wrapper implicitly returns None)
        except Exception:
            if self.fast_fail:
                raise
            else:
                self._log_exception()
        finally:
            # ensure stream is closed by the one who opened it (even if error)
            if stream_opened_by_me:
                log.debug('closing stream {!r} after {}'
                          .format(MAIN_STREAM_NAME, func.__name__))
                self._open_main_stream.close()
                self._open_main_stream = None
    return wrapped
  1071 +
def generator_with_opened_main_stream(func):
    """ same as with_opened_main_stream but for generator functions

    Re-yields the wrapped generator's results instead of returning them, so
    the stream stays open until the generator is exhausted (or closed /
    garbage-collected, which triggers the finally-clause via GeneratorExit --
    note GeneratorExit is a BaseException, so it is not swallowed below).
    """
    from functools import wraps

    @wraps(func)   # preserve func's __name__/docstring on the wrapper
    def wrapped(self, *args, **kwargs):
        # remember who opened the stream so that function also closes it
        stream_opened_by_me = False
        try:
            # open stream if required
            if self._open_main_stream is None:
                log.debug('opening stream {!r} for {}'
                          .format(MAIN_STREAM_NAME, func.__name__))
                self._open_main_stream = self.ole.openstream(MAIN_STREAM_NAME)
                stream_opened_by_me = True

            # run actual function
            for result in func(self, self._open_main_stream, *args, **kwargs):
                yield result

        # error handling: honour fast_fail, otherwise just log and stop
        except Exception:
            if self.fast_fail:
                raise
            else:
                self._log_exception()
        finally:
            # ensure stream is closed by the one who opened it (even if error)
            if stream_opened_by_me:
                log.debug('closing stream {!r} after {}'
                          .format(MAIN_STREAM_NAME, func.__name__))
                self._open_main_stream.close()
                self._open_main_stream = None
    return wrapped
1033 1105
1034 class PptParser(object): 1106 class PptParser(object):
1035 """ Parser for PowerPoint 97-2003 specific data structures 1107 """ Parser for PowerPoint 97-2003 specific data structures
@@ -1074,6 +1146,8 @@ class PptParser(object): @@ -1074,6 +1146,8 @@ class PptParser(object):
1074 if not MAIN_STREAM_NAME.lower() in root_streams: 1146 if not MAIN_STREAM_NAME.lower() in root_streams:
1075 self._fail('root', 'listdir', root_streams, MAIN_STREAM_NAME) 1147 self._fail('root', 'listdir', root_streams, MAIN_STREAM_NAME)
1076 1148
  1149 + self._open_main_stream = None
  1150 +
1077 def _log_exception(self, msg=None): 1151 def _log_exception(self, msg=None):
1078 """ log an exception instead of raising it 1152 """ log an exception instead of raising it
1079 1153
@@ -1121,7 +1195,7 @@ class PptParser(object): @@ -1121,7 +1195,7 @@ class PptParser(object):
1121 1195
1122 stream = None 1196 stream = None
1123 try: 1197 try:
1124 - log.debug('opening stream') 1198 + log.debug('opening stream "Current User"')
1125 stream = self.ole.openstream('Current User') 1199 stream = self.ole.openstream('Current User')
1126 self.current_user_atom = CurrentUserAtom.extract_from(stream) 1200 self.current_user_atom = CurrentUserAtom.extract_from(stream)
1127 except Exception: 1201 except Exception:
@@ -1131,10 +1205,11 @@ class PptParser(object): @@ -1131,10 +1205,11 @@ class PptParser(object):
1131 self._log_exception() 1205 self._log_exception()
1132 finally: 1206 finally:
1133 if stream is not None: 1207 if stream is not None:
1134 - log.debug('closing stream') 1208 + log.debug('closing stream "Current User"')
1135 stream.close() 1209 stream.close()
1136 1210
1137 - def parse_persist_object_directory(self): 1211 + @with_opened_main_stream
  1212 + def parse_persist_object_directory(self, stream):
1138 """ Part 1: Construct the persist object directory """ 1213 """ Part 1: Construct the persist object directory """
1139 1214
1140 if self.persist_object_directory is not None: 1215 if self.persist_object_directory is not None:
@@ -1152,100 +1227,87 @@ class PptParser(object): @@ -1152,100 +1227,87 @@ class PptParser(object):
1152 self.persist_object_directory = {} 1227 self.persist_object_directory = {}
1153 self.newest_user_edit = None 1228 self.newest_user_edit = None
1154 1229
1155 - stream = None  
1156 - try:  
1157 - log.debug('opening stream')  
1158 - stream = self.ole.openstream(MAIN_STREAM_NAME)  
1159 -  
1160 - # Repeat steps 3 through 6 until offsetLastEdit is 0x00000000.  
1161 - while offset != 0:  
1162 -  
1163 - # Step 2: Seek, in the PowerPoint Document Stream, to the  
1164 - # offset specified by the offsetToCurrentEdit field of the  
1165 - # CurrentUserAtom record identified in step 1.  
1166 - stream.seek(offset, os.SEEK_SET)  
1167 -  
1168 - # Step 3: Read the UserEditAtom record at the current offset.  
1169 - # Let this record be a live record.  
1170 - user_edit = UserEditAtom.extract_from(stream, is_encrypted)  
1171 - if self.newest_user_edit is None:  
1172 - self.newest_user_edit = user_edit  
1173 -  
1174 - log.debug('checking validity')  
1175 - errs = user_edit.check_validity()  
1176 - if errs:  
1177 - log.warning('check_validity found {} issues'  
1178 - .format(len(errs)))  
1179 - for err in errs:  
1180 - log.warning('UserEditAtom.check_validity: {}'.format(err))  
1181 - if errs and self.fast_fail:  
1182 - raise errs[0]  
1183 -  
1184 - # Step 4: Seek to the offset specified by the  
1185 - # offsetPersistDirectory field of the UserEditAtom record  
1186 - # identified in step 3.  
1187 - log.debug('seeking to pos {}'  
1188 - .format(user_edit.offset_persist_directory))  
1189 - stream.seek(user_edit.offset_persist_directory, os.SEEK_SET)  
1190 -  
1191 - # Step 5: Read the PersistDirectoryAtom record at the current  
1192 - # offset. Let this record be a live record.  
1193 - persist_dir_atom = PersistDirectoryAtom.extract_from(stream) 1230 + # Repeat steps 3 through 6 until offsetLastEdit is 0x00000000.
  1231 + while offset != 0:
1194 1232
1195 - log.debug('checking validity')  
1196 - errs = persist_dir_atom.check_validity(offset)  
1197 - if errs:  
1198 - log.warning('check_validity found {} issues'  
1199 - .format(len(errs)))  
1200 - for err in errs:  
1201 - log.warning('PersistDirectoryAtom.check_validity: {}'  
1202 - .format(err))  
1203 - if errs and self.fast_fail:  
1204 - raise errs[0]  
1205 -  
1206 -  
1207 - # Construct the complete persist object directory for this file  
1208 - # as follows:  
1209 - # - For each PersistDirectoryAtom record previously identified  
1210 - # in step 5, add the persist object identifier and persist  
1211 - # object stream offset pairs to the persist object directory  
1212 - # starting with the PersistDirectoryAtom record last  
1213 - # identified, that is, the one closest to the beginning of the  
1214 - # stream.  
1215 - # - Continue adding these pairs to the persist object directory  
1216 - # for each PersistDirectoryAtom record in the reverse order  
1217 - # that they were identified in step 5; that is, the pairs from  
1218 - # the PersistDirectoryAtom record closest to the end of the  
1219 - # stream are added last.  
1220 - # - When adding a new pair to the persist object directory, if  
1221 - # the persist object identifier already exists in the persist  
1222 - # object directory, the persist object stream offset from the  
1223 - # new pair replaces the existing persist object stream offset  
1224 - # for that persist object identifier.  
1225 - for entry in persist_dir_atom.rg_persist_dir_entry:  
1226 - last_id = entry.persist_id+len(entry.rg_persist_offset)-1  
1227 - log.debug('for persist IDs {}-{}, save offsets {}'  
1228 - .format(entry.persist_id, last_id,  
1229 - entry.rg_persist_offset))  
1230 - for count, offset in enumerate(entry.rg_persist_offset):  
1231 - self.persist_object_directory[entry.persist_id+count] \  
1232 - = offset  
1233 -  
1234 - # check for more  
1235 - # Step 6: Seek to the offset specified by the offsetLastEdit  
1236 - # field in the UserEditAtom record identified in step 3.  
1237 - offset = user_edit.offset_last_edit  
1238 - except Exception:  
1239 - if self.fast_fail:  
1240 - raise  
1241 - else:  
1242 - self._log_exception()  
1243 - finally:  
1244 - if stream is not None:  
1245 - log.debug('closing stream')  
1246 - stream.close() 1233 + # Step 2: Seek, in the PowerPoint Document Stream, to the
  1234 + # offset specified by the offsetToCurrentEdit field of the
  1235 + # CurrentUserAtom record identified in step 1.
  1236 + stream.seek(offset, os.SEEK_SET)
1247 1237
1248 - def parse_document_persist_object(self): 1238 + # Step 3: Read the UserEditAtom record at the current offset.
  1239 + # Let this record be a live record.
  1240 + user_edit = UserEditAtom.extract_from(stream, is_encrypted)
  1241 + if self.newest_user_edit is None:
  1242 + self.newest_user_edit = user_edit
  1243 +
  1244 + log.debug('checking validity')
  1245 + errs = user_edit.check_validity()
  1246 + if errs:
  1247 + log.warning('check_validity found {} issues'
  1248 + .format(len(errs)))
  1249 + for err in errs:
  1250 + log.warning('UserEditAtom.check_validity: {}'.format(err))
  1251 + if errs and self.fast_fail:
  1252 + raise errs[0]
  1253 +
  1254 + # Step 4: Seek to the offset specified by the
  1255 + # offsetPersistDirectory field of the UserEditAtom record
  1256 + # identified in step 3.
  1257 + log.debug('seeking to pos {}'
  1258 + .format(user_edit.offset_persist_directory))
  1259 + stream.seek(user_edit.offset_persist_directory, os.SEEK_SET)
  1260 +
  1261 + # Step 5: Read the PersistDirectoryAtom record at the current
  1262 + # offset. Let this record be a live record.
  1263 + persist_dir_atom = PersistDirectoryAtom.extract_from(stream)
  1264 +
  1265 + log.debug('checking validity')
  1266 + errs = persist_dir_atom.check_validity(offset)
  1267 + if errs:
  1268 + log.warning('check_validity found {} issues'
  1269 + .format(len(errs)))
  1270 + for err in errs:
  1271 + log.warning('PersistDirectoryAtom.check_validity: {}'
  1272 + .format(err))
  1273 + if errs and self.fast_fail:
  1274 + raise errs[0]
  1275 +
  1276 +
  1277 + # Construct the complete persist object directory for this file
  1278 + # as follows:
  1279 + # - For each PersistDirectoryAtom record previously identified
  1280 + # in step 5, add the persist object identifier and persist
  1281 + # object stream offset pairs to the persist object directory
  1282 + # starting with the PersistDirectoryAtom record last
  1283 + # identified, that is, the one closest to the beginning of the
  1284 + # stream.
  1285 + # - Continue adding these pairs to the persist object directory
  1286 + # for each PersistDirectoryAtom record in the reverse order
  1287 + # that they were identified in step 5; that is, the pairs from
  1288 + # the PersistDirectoryAtom record closest to the end of the
  1289 + # stream are added last.
  1290 + # - When adding a new pair to the persist object directory, if
  1291 + # the persist object identifier already exists in the persist
  1292 + # object directory, the persist object stream offset from the
  1293 + # new pair replaces the existing persist object stream offset
  1294 + # for that persist object identifier.
  1295 + for entry in persist_dir_atom.rg_persist_dir_entry:
  1296 + last_id = entry.persist_id+len(entry.rg_persist_offset)-1
  1297 + log.debug('for persist IDs {}-{}, save offsets {}'
  1298 + .format(entry.persist_id, last_id,
  1299 + entry.rg_persist_offset))
  1300 + for count, offset in enumerate(entry.rg_persist_offset):
  1301 + self.persist_object_directory[entry.persist_id+count] \
  1302 + = offset
  1303 +
  1304 + # check for more
  1305 + # Step 6: Seek to the offset specified by the offsetLastEdit
  1306 + # field in the UserEditAtom record identified in step 3.
  1307 + offset = user_edit.offset_last_edit
  1308 +
  1309 + @with_opened_main_stream
  1310 + def parse_document_persist_object(self, stream):
1249 """ Part 2: Identify the document persist object """ 1311 """ Part 2: Identify the document persist object """
1250 if self.document_persist_obj is not None: 1312 if self.document_persist_obj is not None:
1251 log.warning('re-reading and overwriting ' 1313 log.warning('re-reading and overwriting '
@@ -1265,27 +1327,13 @@ class PptParser(object): @@ -1265,27 +1327,13 @@ class PptParser(object):
1265 log.debug('newest user edit ID is {}, offset is {}' 1327 log.debug('newest user edit ID is {}, offset is {}'
1266 .format(newest_ref, offset)) 1328 .format(newest_ref, offset))
1267 1329
1268 - stream = None 1330 + # Step 3: Seek to the stream offset specified in step 2.
  1331 + log.debug('seek to {}'.format(offset))
  1332 + stream.seek(offset, os.SEEK_SET)
1269 1333
1270 - try:  
1271 - # Step 3: Seek to the stream offset specified in step 2.  
1272 - log.debug('opening stream')  
1273 - stream = self.ole.openstream(MAIN_STREAM_NAME)  
1274 - log.debug('seek to {}'.format(offset))  
1275 - stream.seek(offset, os.SEEK_SET)  
1276 -  
1277 - # Step 4: Read the DocumentContainer record at the current offset.  
1278 - # Let this record be a live record.  
1279 - self.document_persist_obj = DocumentContainer.extract_from(stream)  
1280 - except Exception:  
1281 - if self.fast_fail:  
1282 - raise  
1283 - else:  
1284 - self._log_exception()  
1285 - finally:  
1286 - if stream is not None:  
1287 - log.debug('closing stream')  
1288 - stream.close() 1334 + # Step 4: Read the DocumentContainer record at the current offset.
  1335 + # Let this record be a live record.
  1336 + self.document_persist_obj = DocumentContainer.extract_from(stream)
1289 1337
1290 log.debug('checking validity') 1338 log.debug('checking validity')
1291 errs = self.document_persist_obj.check_validity() 1339 errs = self.document_persist_obj.check_validity()
@@ -1297,7 +1345,13 @@ class PptParser(object): @@ -1297,7 +1345,13 @@ class PptParser(object):
1297 if errs and self.fast_fail: 1345 if errs and self.fast_fail:
1298 raise errs[0] 1346 raise errs[0]
1299 1347
1300 - def search_pattern(self, pattern, stream): 1348 + #--------------------------------------------------------------------------
  1349 + # 2nd attempt: do not parse whole structure but search through stream and
  1350 + # yield results as they become available
  1351 + # Keep in mind that after every yield the stream position may be anything!
  1352 +
  1353 + @generator_with_opened_main_stream
  1354 + def search_pattern(self, stream, pattern):
1301 """ search for pattern in stream, return indices """ 1355 """ search for pattern in stream, return indices """
1302 1356
1303 BUF_SIZE = 1024 1357 BUF_SIZE = 1024
@@ -1308,30 +1362,28 @@ class PptParser(object): @@ -1308,30 +1362,28 @@ class PptParser(object):
1308 raise ValueError('need buf > pattern to search!') 1362 raise ValueError('need buf > pattern to search!')
1309 1363
1310 n_reads = 0 1364 n_reads = 0
1311 - candidates = []  
1312 while True: 1365 while True:
1313 start_pos = stream.tell() 1366 start_pos = stream.tell()
1314 n_reads += 1 1367 n_reads += 1
1315 - #log.debug('read {} starting from {}'  
1316 - # .format(BUF_SIZE, start_pos)) 1368 + log.debug('read {} starting from {}'
  1369 + .format(BUF_SIZE, start_pos))
1317 buf = stream.read(BUF_SIZE) 1370 buf = stream.read(BUF_SIZE)
1318 idx = buf.find(pattern) 1371 idx = buf.find(pattern)
1319 while idx != -1: 1372 while idx != -1:
1320 log.debug('found pattern at index {}'.format(start_pos+idx)) 1373 log.debug('found pattern at index {}'.format(start_pos+idx))
1321 - candidates.append(start_pos+idx) 1374 + yield start_pos + idx
1322 idx = buf.find(pattern, idx+1) 1375 idx = buf.find(pattern, idx+1)
1323 1376
1324 if len(buf) == BUF_SIZE: 1377 if len(buf) == BUF_SIZE:
1325 # move back a bit to avoid splitting of pattern through buf 1378 # move back a bit to avoid splitting of pattern through buf
1326 - stream.seek(-1 * pattern_len, os.SEEK_CUR) 1379 + stream.seek(start_pos + BUF_SIZE - pattern_len, os.SEEK_SET)
1327 else: 1380 else:
1328 log.debug('reached end of buf (read {}<{}) after {} reads' 1381 log.debug('reached end of buf (read {}<{}) after {} reads'
1329 .format(len(buf), BUF_SIZE, n_reads)) 1382 .format(len(buf), BUF_SIZE, n_reads))
1330 break 1383 break
1331 - return candidates  
1332 1384
1333 -  
1334 - def search_vba_info(self): 1385 + @generator_with_opened_main_stream
  1386 + def search_vba_info(self, stream):
1335 """ search through stream for VBAInfoContainer, alternative to parse... 1387 """ search through stream for VBAInfoContainer, alternative to parse...
1336 1388
1337 quick-and-dirty: do not parse everything, just look for right bytes 1389 quick-and-dirty: do not parse everything, just look for right bytes
@@ -1348,51 +1400,37 @@ class PptParser(object): @@ -1348,51 +1400,37 @@ class PptParser(object):
1348 rec_len=VBAInfoContainer.RECORD_LENGTH) \ 1400 rec_len=VBAInfoContainer.RECORD_LENGTH) \
1349 + VBAInfoAtom.generate_pattern( 1401 + VBAInfoAtom.generate_pattern(
1350 rec_len=VBAInfoAtom.RECORD_LENGTH) 1402 rec_len=VBAInfoAtom.RECORD_LENGTH)
1351 - stream = None  
1352 - try:  
1353 - log.debug('opening stream')  
1354 - stream = self.ole.openstream(MAIN_STREAM_NAME)  
1355 -  
1356 - # look for candidate positions  
1357 - candidates = self.search_pattern(pattern, stream)  
1358 -  
1359 - # try parse  
1360 - containers = []  
1361 - for idx in candidates:  
1362 - # assume that in stream at idx there is a VBAInfoContainer  
1363 - stream.seek(idx)  
1364 - log.debug('extracting at idx {}'.format(idx))  
1365 - try:  
1366 - container = VBAInfoContainer.extract_from(stream)  
1367 - except Exception:  
1368 - self._log_exception()  
1369 - continue  
1370 -  
1371 - errs = container.check_validity()  
1372 - if errs:  
1373 - log.warning('check_validity found {} issues'  
1374 - .format(len(errs)))  
1375 - else:  
1376 - log.debug('container is ok')  
1377 - atom = container.vba_info_atom  
1378 - log.debug('persist id ref is {}, has_macros {}, version {}'  
1379 - .format(atom.persist_id_ref, atom.f_has_macros,  
1380 - atom.version))  
1381 - containers.append(container)  
1382 - for err in errs:  
1383 - log.warning('check_validity(VBAInfoContainer): {}'  
1384 - .format(err))  
1385 - if errs and self.fast_fail:  
1386 - raise errs[0]  
1387 -  
1388 - return containers  
1389 1403
1390 - finally:  
1391 - if stream is not None:  
1392 - log.debug('closing stream')  
1393 - stream.close() 1404 + # try parse
  1405 + for idx in self.search_pattern(pattern):
  1406 + # assume that in stream at idx there is a VBAInfoContainer
  1407 + stream.seek(idx)
  1408 + log.debug('extracting at idx {}'.format(idx))
  1409 + try:
  1410 + container = VBAInfoContainer.extract_from(stream)
  1411 + except Exception:
  1412 + self._log_exception()
  1413 + continue
1394 1414
1395 - def search_vba_storage(self): 1415 + errs = container.check_validity()
  1416 + if errs:
  1417 + log.warning('check_validity found {} issues'
  1418 + .format(len(errs)))
  1419 + else:
  1420 + log.debug('container is ok')
  1421 + atom = container.vba_info_atom
  1422 + log.debug('persist id ref is {}, has_macros {}, version {}'
  1423 + .format(atom.persist_id_ref, atom.f_has_macros,
  1424 + atom.version))
  1425 + yield container
  1426 + for err in errs:
  1427 + log.warning('check_validity(VBAInfoContainer): {}'
  1428 + .format(err))
  1429 + if errs and self.fast_fail:
  1430 + raise errs[0]
  1431 +
  1432 + @generator_with_opened_main_stream
  1433 + def search_vba_storage(self, stream):
1396 """ search through stream for VBAProjectStg, alternative to parse... 1434 """ search through stream for VBAProjectStg, alternative to parse...
1397 1435
1398 quick-and-dirty: do not parse everything, just look for right bytes 1436 quick-and-dirty: do not parse everything, just look for right bytes
@@ -1403,120 +1441,113 @@ class PptParser(object): @@ -1403,120 +1441,113 @@ class PptParser(object):
1403 The storages found could also contain (instead of VBA data): ActiveX 1441 The storages found could also contain (instead of VBA data): ActiveX
1404 data or general OLE data 1442 data or general OLE data
1405 1443
  1444 + yields results as it finds them
  1445 +
1406 .. seealso:: :py:meth:`search_vba_info` 1446 .. seealso:: :py:meth:`search_vba_info`
1407 """ 1447 """
1408 1448
1409 logging.debug('looking for VBA storage objects') 1449 logging.debug('looking for VBA storage objects')
1410 - stream = None  
1411 - try:  
1412 - log.debug('opening stream')  
1413 - stream = self.ole.openstream(MAIN_STREAM_NAME)  
1414 -  
1415 - storages = []  
1416 - for obj_type in (ExternalObjectStorageUncompressed,  
1417 - ExternalObjectStorageCompressed):  
1418 - # re-position stream at start  
1419 - stream.seek(0, os.SEEK_SET)  
1420 -  
1421 - # look for candidate positions  
1422 - pattern = obj_type.generate_pattern()  
1423 - candidates = self.search_pattern(pattern, stream)  
1424 -  
1425 - # try parse  
1426 - for idx in candidates:  
1427 - # assume a ExternalObjectStorage in stream at idx  
1428 - stream.seek(idx)  
1429 - log.debug('extracting at idx {}'.format(idx))  
1430 - try:  
1431 - storage = obj_type.extract_from(stream)  
1432 - except Exception:  
1433 - self._log_exception()  
1434 - continue  
1435 -  
1436 - errs = storage.check_validity()  
1437 - if errs:  
1438 - log.warning('check_validity found {} issues'  
1439 - .format(len(errs)))  
1440 - else:  
1441 - log.debug('storage is ok; compressed={}, size={}, '  
1442 - 'size_decomp={}'  
1443 - .format(storage.is_compressed,  
1444 - storage.rec_head.rec_len,  
1445 - storage.uncompressed_size))  
1446 - storages.append(storage)  
1447 - for err in errs:  
1448 - log.warning('check_validity({}): {}'  
1449 - .format(obj_type.__name__, err))  
1450 - if errs and self.fast_fail:  
1451 - raise errs[0]  
1452 -  
1453 - return storages 1450 + for obj_type in (ExternalObjectStorageUncompressed,
  1451 + ExternalObjectStorageCompressed):
  1452 + # re-position stream at start
  1453 + stream.seek(0, os.SEEK_SET)
1454 1454
1455 - finally:  
1456 - if stream is not None:  
1457 - log.debug('closing stream')  
1458 - stream.close() 1455 + pattern = obj_type.generate_pattern()
1459 1456
  1457 + # try parse
  1458 + for idx in self.search_pattern(pattern):
  1459 + # assume a ExternalObjectStorage in stream at idx
  1460 + stream.seek(idx)
  1461 + log.debug('extracting at idx {}'.format(idx))
  1462 + try:
  1463 + storage = obj_type.extract_from(stream)
  1464 + except Exception:
  1465 + self._log_exception()
  1466 + continue
  1467 +
  1468 + errs = storage.check_validity()
  1469 + if errs:
  1470 + log.warning('check_validity found {} issues'
  1471 + .format(len(errs)))
  1472 + else:
  1473 + log.debug('storage is ok; compressed={}, size={}, '
  1474 + 'size_decomp={}'
  1475 + .format(storage.is_compressed,
  1476 + storage.rec_head.rec_len,
  1477 + storage.uncompressed_size))
  1478 + yield storage
  1479 + for err in errs:
  1480 + log.warning('check_validity({}): {}'
  1481 + .format(obj_type.__name__, err))
  1482 + if errs and self.fast_fail:
  1483 + raise errs[0]
1460 1484
1461 - def decompress_vba_storage(self, storage): 1485 + @with_opened_main_stream
  1486 + def decompress_vba_storage(self, stream, storage):
1462 """ return decompressed data from search_vba_storage """ 1487 """ return decompressed data from search_vba_storage """
1463 1488
1464 log.debug('decompressing storage for VBA OLE data stream ') 1489 log.debug('decompressing storage for VBA OLE data stream ')
1465 - stream = None  
1466 - try:  
1467 - log.debug('opening stream')  
1468 - stream = self.ole.openstream(MAIN_STREAM_NAME)  
1469 -  
1470 - # decompress iteratively; a zlib.decompress of all data  
1471 - # failed with Error -5 (incomplete or truncated stream)  
1472 - stream.seek(storage.data_offset, os.SEEK_SET)  
1473 - decomp, n_read, err = \  
1474 - iterative_decompress(stream, storage.data_size)  
1475 - log.debug('decompressed {} to {} bytes; found err: {}'  
1476 - .format(n_read, len(decomp), err))  
1477 - if err and self.fast_fail:  
1478 - raise err  
1479 - # otherwise try to continue with partial data  
1480 -  
1481 - return decomp  
1482 -  
1483 - ## create OleFileIO from decompressed data  
1484 - #ole = olefile.OleFileIO(decomp)  
1485 - #root_streams = [entry[0].lower() for entry in ole.listdir()]  
1486 - #for required in 'project', 'projectwm', 'vba':  
1487 - # if required not in root_streams:  
1488 - # raise ValueError('storage seems to not be a VBA storage '  
1489 - # '({} not found in root streams)'  
1490 - # .format(required))  
1491 - #log.debug('tests succeeded')  
1492 - #return ole  
1493 -  
1494 - finally:  
1495 - if stream is not None:  
1496 - log.debug('closing stream')  
1497 - stream.close()  
1498 1490
1499 -  
1500 - def read_vba_storage_data(self, storage): 1491 + # decompress iteratively; a zlib.decompress of all data
  1492 + # failed with Error -5 (incomplete or truncated stream)
  1493 + stream.seek(storage.data_offset, os.SEEK_SET)
  1494 + decomp, n_read, err = \
  1495 + iterative_decompress(stream, storage.data_size)
  1496 + log.debug('decompressed {} to {} bytes; found err: {}'
  1497 + .format(n_read, len(decomp), err))
  1498 + if err and self.fast_fail:
  1499 + raise err
  1500 + # otherwise try to continue with partial data
  1501 +
  1502 + return decomp
  1503 +
  1504 + ## create OleFileIO from decompressed data
  1505 + #ole = olefile.OleFileIO(decomp)
  1506 + #root_streams = [entry[0].lower() for entry in ole.listdir()]
  1507 + #for required in 'project', 'projectwm', 'vba':
  1508 + # if required not in root_streams:
  1509 + # raise ValueError('storage seems to not be a VBA storage '
  1510 + # '({} not found in root streams)'
  1511 + # .format(required))
  1512 + #log.debug('tests succeeded')
  1513 + #return ole
  1514 +
  1515 + @with_opened_main_stream
  1516 + def read_vba_storage_data(self, stream, storage):
1501 """ return data pointed to by uncompressed storage """ 1517 """ return data pointed to by uncompressed storage """
1502 1518
1503 - log.debug('reading uncompressed VBA OLE data stream')  
1504 - stream = None  
1505 - try:  
1506 - log.debug('opening stream')  
1507 - stream = self.ole.openstream(MAIN_STREAM_NAME)  
1508 -  
1509 - log.debug('reading {} bytes starting at {}'  
1510 - .format(storage.data_size, storage.data_offset))  
1511 - stream.seek(storage.data_offset, os.SEEK_SET)  
1512 - data = stream.read(storage.data_size)  
1513 -  
1514 - return data 1519 + log.debug('reading uncompressed VBA OLE data stream: '
  1520 + '{} bytes starting at {}'
  1521 + .format(storage.data_size, storage.data_offset))
  1522 + stream.seek(storage.data_offset, os.SEEK_SET)
  1523 + data = stream.read(storage.data_size)
  1524 + return data
  1525 +
  1526 + @generator_with_opened_main_stream
  1527 + def iter_vba_data(self, stream):
  1528 + """ search vba infos and storages, yield uncompressed storage data """
  1529 +
  1530 + n_infos = 0
  1531 + n_macros = 0
  1532 + for info in self.search_vba_info():
  1533 + n_infos += 1
  1534 + if info.vba_info_atom.f_has_macros > 0:
  1535 + n_macros += 1
  1536 + # TODO: does it make sense at all to continue if n_macros == 0?
  1537 + # --> no vba-info, so all storages probably ActiveX or other OLE
  1538 + n_storages = 0
  1539 + n_compressed = 0
  1540 + for storage in self.search_vba_storage():
  1541 + n_storages += 1
  1542 + if storage.is_compressed:
  1543 + n_compressed += 1
  1544 + yield self.decompress_vba_storage(storage)
  1545 + else:
  1546 + yield self.read_vba_storage_data(storage)
1515 1547
1516 - finally:  
1517 - if stream is not None:  
1518 - log.debug('closing stream')  
1519 - stream.close() 1548 + log.info('found {} infos ({} with macros) and {} storages '
  1549 + '({} compressed)'
  1550 + .format(n_infos, n_macros, n_storages, n_compressed))
1520 1551
1521 1552
1522 def iterative_decompress(stream, size, chunk_size=4096): 1553 def iterative_decompress(stream, size, chunk_size=4096):
@@ -1559,16 +1590,9 @@ def test(): @@ -1559,16 +1590,9 @@ def test():
1559 try: 1590 try:
1560 ppt = PptParser(file_name, fast_fail=False) 1591 ppt = PptParser(file_name, fast_fail=False)
1561 #ppt.parse_document_persist_object() 1592 #ppt.parse_document_persist_object()
1562 - n_infos = len(ppt.search_vba_info())  
1563 - storages = ppt.search_vba_storage()  
1564 - n_storages = len(storages)  
1565 - log.debug('found {} infos and {} storages'.format(n_infos,  
1566 - n_storages))  
1567 - if n_infos != n_storages:  
1568 - log.warning('found different number of vba infos and storages')  
1569 - for storage in storages:  
1570 - parser = VBA_Parser(None, ppt.decompress_vba_storage(storage),  
1571 - container='PptParser') 1593 +
  1594 + for vba_data in ppt.iter_vba_data():
  1595 + parser = VBA_Parser(None, vba_data, container='PptParser')
1572 for vba_root, project_path, dir_path in \ 1596 for vba_root, project_path, dir_path in \
1573 parser.find_vba_projects(): 1597 parser.find_vba_projects():
1574 log.info('found vba project: root={}, proj={}, dir={}' 1598 log.info('found vba project: root={}, proj={}, dir={}'