Commit 823d07b35bee7ad7705e8a0ce73d8c1ca72ce758 (1 parent: 754ae5d9)
make many ppt_parser functions generators; use decorator for try-open-except-close(stream)
Showing 2 changed files with 306 additions and 299 deletions
oletools/olevba.py
| ... | ... | @@ -2203,27 +2203,10 @@ class VBA_Parser(object): |
| 2203 | 2203 | ppt_parser.enable_logging() |
| 2204 | 2204 | try: |
| 2205 | 2205 | ppt = ppt_parser.PptParser(self.ole_file, fast_fail=True) |
| 2206 | - info_container = ppt.search_vba_info() | |
| 2207 | - n_infos = len(info_container) | |
| 2208 | - n_macros = sum(1 for info in info_container | |
| 2209 | - if info.vba_info_atom.f_has_macros > 0) | |
| 2210 | - n_infos = len(ppt.search_vba_info()) | |
| 2211 | - # TODO: does it make sense at all to continue if n_macros == 0? | |
| 2212 | - # --> no vba-info, so all storages probably ActiveX or other OLE | |
| 2213 | - storages = ppt.search_vba_storage() | |
| 2214 | - n_storages = len(storages) | |
| 2215 | - n_compressed = 0 | |
| 2216 | - for storage in storages: | |
| 2217 | - if storage.is_compressed: | |
| 2218 | - storage_decomp = ppt.decompress_vba_storage(storage) | |
| 2219 | - n_compressed += 1 | |
| 2220 | - else: | |
| 2221 | - storage_decomp = ppt.read_vba_storage_data(storage) | |
| 2222 | - self.ole_subfiles.append(VBA_Parser(None, storage_decomp, | |
| 2206 | + for vba_data in ppt.iter_vba_data(): | |
| 2207 | + self.ole_subfiles.append(VBA_Parser(None, vba_data, | |
| 2223 | 2208 | container='PptParser')) |
| 2224 | - log.info('File is PPT with {} vba infos ({} with macros) and {} ' | |
| 2225 | - 'vba storages ({} compressed)' | |
| 2226 | - .format(n_infos, n_macros, n_storages, n_compressed)) | |
| 2209 | + log.info('File is PPT') | |
| 2227 | 2210 | self.ole_file.close() # just in case |
| 2228 | 2211 | self.ole_file = None # required to make other methods look at ole_subfiles |
| 2229 | 2212 | self.type = TYPE_PPT | ... | ... |
oletools/ppt_parser.py
| ... | ... | @@ -21,12 +21,12 @@ References: |
| 21 | 21 | # - can speed-up by using less bigger struct.parse calls? |
| 22 | 22 | # - license |
| 23 | 23 | # - make buffered stream from output of iterative_decompress |
| 24 | -# - less stream open/close, possibly through decorator for open+closing? | |
| 24 | +# - maybe can merge the 2 decorators into 1? (with_opened_main_stream) | |
| 25 | 25 | # |
| 26 | 26 | # CHANGELOG: |
| 27 | 27 | # 2016-05-04 v0.01 CH: - start parsing "Current User" stream |
| 28 | 28 | |
| 29 | -__version__ = '0.01' | |
| 29 | +__version__ = '0.02' | |
| 30 | 30 | |
| 31 | 31 | |
| 32 | 32 | #--- IMPORTS ------------------------------------------------------------------ |
| ... | ... | @@ -1030,6 +1030,78 @@ class ExternalObjectStorageCompressed(ExternalObjectStorage): |
| 1030 | 1030 | |
| 1031 | 1031 | # === PptParser =============================================================== |
| 1032 | 1032 | |
| 1033 | +def with_opened_main_stream(func): | |
| 1034 | + """ a decorator that can open and close the default stream for func | |
| 1035 | + | |
| 1036 | + to be applied only to functions in PptParser that read from default stream | |
| 1037 | + (:py:data:`MAIN_STREAM_NAME`) | |
| 1038 | + | |
| 1039 | + Decorated functions need to accept args (self, stream, ...) | |
| 1040 | + """ | |
| 1041 | + | |
| 1042 | + def wrapped(self, *args, **kwargs): | |
| 1043 | + # remember who opened the stream so that function also closes it | |
| 1044 | + stream_opened_by_me = False | |
| 1045 | + try: | |
| 1046 | + # open stream if required | |
| 1047 | + if self._open_main_stream is None: | |
| 1048 | + log.debug('opening stream {!r} for {}' | |
| 1049 | + .format(MAIN_STREAM_NAME, func.__name__)) | |
| 1050 | + self._open_main_stream = self.ole.openstream(MAIN_STREAM_NAME) | |
| 1051 | + stream_opened_by_me = True | |
| 1052 | + | |
| 1053 | + # run wrapped function | |
| 1054 | + return func(self, self._open_main_stream, *args, **kwargs) | |
| 1055 | + | |
| 1056 | + # error handling | |
| 1057 | + except Exception: | |
| 1058 | + if self.fast_fail: | |
| 1059 | + raise | |
| 1060 | + else: | |
| 1061 | + self._log_exception() | |
| 1062 | + finally: | |
| 1063 | + # ensure stream is closed by the one who opened it (even if error) | |
| 1064 | + if stream_opened_by_me: | |
| 1065 | + log.debug('closing stream {!r} after {}' | |
| 1066 | + .format(MAIN_STREAM_NAME, func.__name__)) | |
| 1067 | + self._open_main_stream.close() | |
| 1068 | + self._open_main_stream = None | |
| 1069 | + return wrapped | |
| 1070 | + | |
| 1071 | + | |
| 1072 | +def generator_with_opened_main_stream(func): | |
| 1073 | + """ same as with_opened_main_stream but with yield instead of return """ | |
| 1074 | + | |
| 1075 | + def wrapped(self, *args, **kwargs): | |
| 1076 | + # remember who opened the stream so that function also closes it | |
| 1077 | + stream_opened_by_me = False | |
| 1078 | + try: | |
| 1079 | + # open stream if required | |
| 1080 | + if self._open_main_stream is None: | |
| 1081 | + log.debug('opening stream {!r} for {}' | |
| 1082 | + .format(MAIN_STREAM_NAME, func.__name__)) | |
| 1083 | + self._open_main_stream = self.ole.openstream(MAIN_STREAM_NAME) | |
| 1084 | + stream_opened_by_me = True | |
| 1085 | + | |
| 1086 | + # run actual function | |
| 1087 | + for result in func(self, self._open_main_stream, *args, **kwargs): | |
| 1088 | + yield result | |
| 1089 | + | |
| 1090 | + # error handling | |
| 1091 | + except Exception: | |
| 1092 | + if self.fast_fail: | |
| 1093 | + raise | |
| 1094 | + else: | |
| 1095 | + self._log_exception() | |
| 1096 | + finally: | |
| 1097 | + # ensure stream is closed by the one who opened it (even if error) | |
| 1098 | + if stream_opened_by_me: | |
| 1099 | + log.debug('closing stream {!r} after {}' | |
| 1100 | + .format(MAIN_STREAM_NAME, func.__name__)) | |
| 1101 | + self._open_main_stream.close() | |
| 1102 | + self._open_main_stream = None | |
| 1103 | + return wrapped | |
| 1104 | + | |
| 1033 | 1105 | |
| 1034 | 1106 | class PptParser(object): |
| 1035 | 1107 | """ Parser for PowerPoint 97-2003 specific data structures |
| ... | ... | @@ -1074,6 +1146,8 @@ class PptParser(object): |
| 1074 | 1146 | if not MAIN_STREAM_NAME.lower() in root_streams: |
| 1075 | 1147 | self._fail('root', 'listdir', root_streams, MAIN_STREAM_NAME) |
| 1076 | 1148 | |
| 1149 | + self._open_main_stream = None | |
| 1150 | + | |
| 1077 | 1151 | def _log_exception(self, msg=None): |
| 1078 | 1152 | """ log an exception instead of raising it |
| 1079 | 1153 | |
| ... | ... | @@ -1121,7 +1195,7 @@ class PptParser(object): |
| 1121 | 1195 | |
| 1122 | 1196 | stream = None |
| 1123 | 1197 | try: |
| 1124 | - log.debug('opening stream') | |
| 1198 | + log.debug('opening stream "Current User"') | |
| 1125 | 1199 | stream = self.ole.openstream('Current User') |
| 1126 | 1200 | self.current_user_atom = CurrentUserAtom.extract_from(stream) |
| 1127 | 1201 | except Exception: |
| ... | ... | @@ -1131,10 +1205,11 @@ class PptParser(object): |
| 1131 | 1205 | self._log_exception() |
| 1132 | 1206 | finally: |
| 1133 | 1207 | if stream is not None: |
| 1134 | - log.debug('closing stream') | |
| 1208 | + log.debug('closing stream "Current User"') | |
| 1135 | 1209 | stream.close() |
| 1136 | 1210 | |
| 1137 | - def parse_persist_object_directory(self): | |
| 1211 | + @with_opened_main_stream | |
| 1212 | + def parse_persist_object_directory(self, stream): | |
| 1138 | 1213 | """ Part 1: Construct the persist object directory """ |
| 1139 | 1214 | |
| 1140 | 1215 | if self.persist_object_directory is not None: |
| ... | ... | @@ -1152,100 +1227,87 @@ class PptParser(object): |
| 1152 | 1227 | self.persist_object_directory = {} |
| 1153 | 1228 | self.newest_user_edit = None |
| 1154 | 1229 | |
| 1155 | - stream = None | |
| 1156 | - try: | |
| 1157 | - log.debug('opening stream') | |
| 1158 | - stream = self.ole.openstream(MAIN_STREAM_NAME) | |
| 1159 | - | |
| 1160 | - # Repeat steps 3 through 6 until offsetLastEdit is 0x00000000. | |
| 1161 | - while offset != 0: | |
| 1162 | - | |
| 1163 | - # Step 2: Seek, in the PowerPoint Document Stream, to the | |
| 1164 | - # offset specified by the offsetToCurrentEdit field of the | |
| 1165 | - # CurrentUserAtom record identified in step 1. | |
| 1166 | - stream.seek(offset, os.SEEK_SET) | |
| 1167 | - | |
| 1168 | - # Step 3: Read the UserEditAtom record at the current offset. | |
| 1169 | - # Let this record be a live record. | |
| 1170 | - user_edit = UserEditAtom.extract_from(stream, is_encrypted) | |
| 1171 | - if self.newest_user_edit is None: | |
| 1172 | - self.newest_user_edit = user_edit | |
| 1173 | - | |
| 1174 | - log.debug('checking validity') | |
| 1175 | - errs = user_edit.check_validity() | |
| 1176 | - if errs: | |
| 1177 | - log.warning('check_validity found {} issues' | |
| 1178 | - .format(len(errs))) | |
| 1179 | - for err in errs: | |
| 1180 | - log.warning('UserEditAtom.check_validity: {}'.format(err)) | |
| 1181 | - if errs and self.fast_fail: | |
| 1182 | - raise errs[0] | |
| 1183 | - | |
| 1184 | - # Step 4: Seek to the offset specified by the | |
| 1185 | - # offsetPersistDirectory field of the UserEditAtom record | |
| 1186 | - # identified in step 3. | |
| 1187 | - log.debug('seeking to pos {}' | |
| 1188 | - .format(user_edit.offset_persist_directory)) | |
| 1189 | - stream.seek(user_edit.offset_persist_directory, os.SEEK_SET) | |
| 1190 | - | |
| 1191 | - # Step 5: Read the PersistDirectoryAtom record at the current | |
| 1192 | - # offset. Let this record be a live record. | |
| 1193 | - persist_dir_atom = PersistDirectoryAtom.extract_from(stream) | |
| 1230 | + # Repeat steps 3 through 6 until offsetLastEdit is 0x00000000. | |
| 1231 | + while offset != 0: | |
| 1194 | 1232 | |
| 1195 | - log.debug('checking validity') | |
| 1196 | - errs = persist_dir_atom.check_validity(offset) | |
| 1197 | - if errs: | |
| 1198 | - log.warning('check_validity found {} issues' | |
| 1199 | - .format(len(errs))) | |
| 1200 | - for err in errs: | |
| 1201 | - log.warning('PersistDirectoryAtom.check_validity: {}' | |
| 1202 | - .format(err)) | |
| 1203 | - if errs and self.fast_fail: | |
| 1204 | - raise errs[0] | |
| 1205 | - | |
| 1206 | - | |
| 1207 | - # Construct the complete persist object directory for this file | |
| 1208 | - # as follows: | |
| 1209 | - # - For each PersistDirectoryAtom record previously identified | |
| 1210 | - # in step 5, add the persist object identifier and persist | |
| 1211 | - # object stream offset pairs to the persist object directory | |
| 1212 | - # starting with the PersistDirectoryAtom record last | |
| 1213 | - # identified, that is, the one closest to the beginning of the | |
| 1214 | - # stream. | |
| 1215 | - # - Continue adding these pairs to the persist object directory | |
| 1216 | - # for each PersistDirectoryAtom record in the reverse order | |
| 1217 | - # that they were identified in step 5; that is, the pairs from | |
| 1218 | - # the PersistDirectoryAtom record closest to the end of the | |
| 1219 | - # stream are added last. | |
| 1220 | - # - When adding a new pair to the persist object directory, if | |
| 1221 | - # the persist object identifier already exists in the persist | |
| 1222 | - # object directory, the persist object stream offset from the | |
| 1223 | - # new pair replaces the existing persist object stream offset | |
| 1224 | - # for that persist object identifier. | |
| 1225 | - for entry in persist_dir_atom.rg_persist_dir_entry: | |
| 1226 | - last_id = entry.persist_id+len(entry.rg_persist_offset)-1 | |
| 1227 | - log.debug('for persist IDs {}-{}, save offsets {}' | |
| 1228 | - .format(entry.persist_id, last_id, | |
| 1229 | - entry.rg_persist_offset)) | |
| 1230 | - for count, offset in enumerate(entry.rg_persist_offset): | |
| 1231 | - self.persist_object_directory[entry.persist_id+count] \ | |
| 1232 | - = offset | |
| 1233 | - | |
| 1234 | - # check for more | |
| 1235 | - # Step 6: Seek to the offset specified by the offsetLastEdit | |
| 1236 | - # field in the UserEditAtom record identified in step 3. | |
| 1237 | - offset = user_edit.offset_last_edit | |
| 1238 | - except Exception: | |
| 1239 | - if self.fast_fail: | |
| 1240 | - raise | |
| 1241 | - else: | |
| 1242 | - self._log_exception() | |
| 1243 | - finally: | |
| 1244 | - if stream is not None: | |
| 1245 | - log.debug('closing stream') | |
| 1246 | - stream.close() | |
| 1233 | + # Step 2: Seek, in the PowerPoint Document Stream, to the | |
| 1234 | + # offset specified by the offsetToCurrentEdit field of the | |
| 1235 | + # CurrentUserAtom record identified in step 1. | |
| 1236 | + stream.seek(offset, os.SEEK_SET) | |
| 1247 | 1237 | |
| 1248 | - def parse_document_persist_object(self): | |
| 1238 | + # Step 3: Read the UserEditAtom record at the current offset. | |
| 1239 | + # Let this record be a live record. | |
| 1240 | + user_edit = UserEditAtom.extract_from(stream, is_encrypted) | |
| 1241 | + if self.newest_user_edit is None: | |
| 1242 | + self.newest_user_edit = user_edit | |
| 1243 | + | |
| 1244 | + log.debug('checking validity') | |
| 1245 | + errs = user_edit.check_validity() | |
| 1246 | + if errs: | |
| 1247 | + log.warning('check_validity found {} issues' | |
| 1248 | + .format(len(errs))) | |
| 1249 | + for err in errs: | |
| 1250 | + log.warning('UserEditAtom.check_validity: {}'.format(err)) | |
| 1251 | + if errs and self.fast_fail: | |
| 1252 | + raise errs[0] | |
| 1253 | + | |
| 1254 | + # Step 4: Seek to the offset specified by the | |
| 1255 | + # offsetPersistDirectory field of the UserEditAtom record | |
| 1256 | + # identified in step 3. | |
| 1257 | + log.debug('seeking to pos {}' | |
| 1258 | + .format(user_edit.offset_persist_directory)) | |
| 1259 | + stream.seek(user_edit.offset_persist_directory, os.SEEK_SET) | |
| 1260 | + | |
| 1261 | + # Step 5: Read the PersistDirectoryAtom record at the current | |
| 1262 | + # offset. Let this record be a live record. | |
| 1263 | + persist_dir_atom = PersistDirectoryAtom.extract_from(stream) | |
| 1264 | + | |
| 1265 | + log.debug('checking validity') | |
| 1266 | + errs = persist_dir_atom.check_validity(offset) | |
| 1267 | + if errs: | |
| 1268 | + log.warning('check_validity found {} issues' | |
| 1269 | + .format(len(errs))) | |
| 1270 | + for err in errs: | |
| 1271 | + log.warning('PersistDirectoryAtom.check_validity: {}' | |
| 1272 | + .format(err)) | |
| 1273 | + if errs and self.fast_fail: | |
| 1274 | + raise errs[0] | |
| 1275 | + | |
| 1276 | + | |
| 1277 | + # Construct the complete persist object directory for this file | |
| 1278 | + # as follows: | |
| 1279 | + # - For each PersistDirectoryAtom record previously identified | |
| 1280 | + # in step 5, add the persist object identifier and persist | |
| 1281 | + # object stream offset pairs to the persist object directory | |
| 1282 | + # starting with the PersistDirectoryAtom record last | |
| 1283 | + # identified, that is, the one closest to the beginning of the | |
| 1284 | + # stream. | |
| 1285 | + # - Continue adding these pairs to the persist object directory | |
| 1286 | + # for each PersistDirectoryAtom record in the reverse order | |
| 1287 | + # that they were identified in step 5; that is, the pairs from | |
| 1288 | + # the PersistDirectoryAtom record closest to the end of the | |
| 1289 | + # stream are added last. | |
| 1290 | + # - When adding a new pair to the persist object directory, if | |
| 1291 | + # the persist object identifier already exists in the persist | |
| 1292 | + # object directory, the persist object stream offset from the | |
| 1293 | + # new pair replaces the existing persist object stream offset | |
| 1294 | + # for that persist object identifier. | |
| 1295 | + for entry in persist_dir_atom.rg_persist_dir_entry: | |
| 1296 | + last_id = entry.persist_id+len(entry.rg_persist_offset)-1 | |
| 1297 | + log.debug('for persist IDs {}-{}, save offsets {}' | |
| 1298 | + .format(entry.persist_id, last_id, | |
| 1299 | + entry.rg_persist_offset)) | |
| 1300 | + for count, offset in enumerate(entry.rg_persist_offset): | |
| 1301 | + self.persist_object_directory[entry.persist_id+count] \ | |
| 1302 | + = offset | |
| 1303 | + | |
| 1304 | + # check for more | |
| 1305 | + # Step 6: Seek to the offset specified by the offsetLastEdit | |
| 1306 | + # field in the UserEditAtom record identified in step 3. | |
| 1307 | + offset = user_edit.offset_last_edit | |
| 1308 | + | |
| 1309 | + @with_opened_main_stream | |
| 1310 | + def parse_document_persist_object(self, stream): | |
| 1249 | 1311 | """ Part 2: Identify the document persist object """ |
| 1250 | 1312 | if self.document_persist_obj is not None: |
| 1251 | 1313 | log.warning('re-reading and overwriting ' |
| ... | ... | @@ -1265,27 +1327,13 @@ class PptParser(object): |
| 1265 | 1327 | log.debug('newest user edit ID is {}, offset is {}' |
| 1266 | 1328 | .format(newest_ref, offset)) |
| 1267 | 1329 | |
| 1268 | - stream = None | |
| 1330 | + # Step 3: Seek to the stream offset specified in step 2. | |
| 1331 | + log.debug('seek to {}'.format(offset)) | |
| 1332 | + stream.seek(offset, os.SEEK_SET) | |
| 1269 | 1333 | |
| 1270 | - try: | |
| 1271 | - # Step 3: Seek to the stream offset specified in step 2. | |
| 1272 | - log.debug('opening stream') | |
| 1273 | - stream = self.ole.openstream(MAIN_STREAM_NAME) | |
| 1274 | - log.debug('seek to {}'.format(offset)) | |
| 1275 | - stream.seek(offset, os.SEEK_SET) | |
| 1276 | - | |
| 1277 | - # Step 4: Read the DocumentContainer record at the current offset. | |
| 1278 | - # Let this record be a live record. | |
| 1279 | - self.document_persist_obj = DocumentContainer.extract_from(stream) | |
| 1280 | - except Exception: | |
| 1281 | - if self.fast_fail: | |
| 1282 | - raise | |
| 1283 | - else: | |
| 1284 | - self._log_exception() | |
| 1285 | - finally: | |
| 1286 | - if stream is not None: | |
| 1287 | - log.debug('closing stream') | |
| 1288 | - stream.close() | |
| 1334 | + # Step 4: Read the DocumentContainer record at the current offset. | |
| 1335 | + # Let this record be a live record. | |
| 1336 | + self.document_persist_obj = DocumentContainer.extract_from(stream) | |
| 1289 | 1337 | |
| 1290 | 1338 | log.debug('checking validity') |
| 1291 | 1339 | errs = self.document_persist_obj.check_validity() |
| ... | ... | @@ -1297,7 +1345,13 @@ class PptParser(object): |
| 1297 | 1345 | if errs and self.fast_fail: |
| 1298 | 1346 | raise errs[0] |
| 1299 | 1347 | |
| 1300 | - def search_pattern(self, pattern, stream): | |
| 1348 | + #-------------------------------------------------------------------------- | |
| 1349 | + # 2nd attempt: do not parse whole structure but search through stream and | |
| 1350 | + # yield results as they become available | |
| 1351 | + # Keep in mind that after every yield the stream position may be anything! | |
| 1352 | + | |
| 1353 | + @generator_with_opened_main_stream | |
| 1354 | + def search_pattern(self, stream, pattern): | |
| 1301 | 1355 | """ search for pattern in stream, return indices """ |
| 1302 | 1356 | |
| 1303 | 1357 | BUF_SIZE = 1024 |
| ... | ... | @@ -1308,30 +1362,28 @@ class PptParser(object): |
| 1308 | 1362 | raise ValueError('need buf > pattern to search!') |
| 1309 | 1363 | |
| 1310 | 1364 | n_reads = 0 |
| 1311 | - candidates = [] | |
| 1312 | 1365 | while True: |
| 1313 | 1366 | start_pos = stream.tell() |
| 1314 | 1367 | n_reads += 1 |
| 1315 | - #log.debug('read {} starting from {}' | |
| 1316 | - # .format(BUF_SIZE, start_pos)) | |
| 1368 | + log.debug('read {} starting from {}' | |
| 1369 | + .format(BUF_SIZE, start_pos)) | |
| 1317 | 1370 | buf = stream.read(BUF_SIZE) |
| 1318 | 1371 | idx = buf.find(pattern) |
| 1319 | 1372 | while idx != -1: |
| 1320 | 1373 | log.debug('found pattern at index {}'.format(start_pos+idx)) |
| 1321 | - candidates.append(start_pos+idx) | |
| 1374 | + yield start_pos + idx | |
| 1322 | 1375 | idx = buf.find(pattern, idx+1) |
| 1323 | 1376 | |
| 1324 | 1377 | if len(buf) == BUF_SIZE: |
| 1325 | 1378 | # move back a bit to avoid splitting of pattern through buf |
| 1326 | - stream.seek(-1 * pattern_len, os.SEEK_CUR) | |
| 1379 | + stream.seek(start_pos + BUF_SIZE - pattern_len, os.SEEK_SET) | |
| 1327 | 1380 | else: |
| 1328 | 1381 | log.debug('reached end of buf (read {}<{}) after {} reads' |
| 1329 | 1382 | .format(len(buf), BUF_SIZE, n_reads)) |
| 1330 | 1383 | break |
| 1331 | - return candidates | |
| 1332 | 1384 | |
| 1333 | - | |
| 1334 | - def search_vba_info(self): | |
| 1385 | + @generator_with_opened_main_stream | |
| 1386 | + def search_vba_info(self, stream): | |
| 1335 | 1387 | """ search through stream for VBAInfoContainer, alternative to parse... |
| 1336 | 1388 | |
| 1337 | 1389 | quick-and-dirty: do not parse everything, just look for right bytes |
| ... | ... | @@ -1348,51 +1400,37 @@ class PptParser(object): |
| 1348 | 1400 | rec_len=VBAInfoContainer.RECORD_LENGTH) \ |
| 1349 | 1401 | + VBAInfoAtom.generate_pattern( |
| 1350 | 1402 | rec_len=VBAInfoAtom.RECORD_LENGTH) |
| 1351 | - stream = None | |
| 1352 | - try: | |
| 1353 | - log.debug('opening stream') | |
| 1354 | - stream = self.ole.openstream(MAIN_STREAM_NAME) | |
| 1355 | - | |
| 1356 | - # look for candidate positions | |
| 1357 | - candidates = self.search_pattern(pattern, stream) | |
| 1358 | - | |
| 1359 | - # try parse | |
| 1360 | - containers = [] | |
| 1361 | - for idx in candidates: | |
| 1362 | - # assume that in stream at idx there is a VBAInfoContainer | |
| 1363 | - stream.seek(idx) | |
| 1364 | - log.debug('extracting at idx {}'.format(idx)) | |
| 1365 | - try: | |
| 1366 | - container = VBAInfoContainer.extract_from(stream) | |
| 1367 | - except Exception: | |
| 1368 | - self._log_exception() | |
| 1369 | - continue | |
| 1370 | - | |
| 1371 | - errs = container.check_validity() | |
| 1372 | - if errs: | |
| 1373 | - log.warning('check_validity found {} issues' | |
| 1374 | - .format(len(errs))) | |
| 1375 | - else: | |
| 1376 | - log.debug('container is ok') | |
| 1377 | - atom = container.vba_info_atom | |
| 1378 | - log.debug('persist id ref is {}, has_macros {}, version {}' | |
| 1379 | - .format(atom.persist_id_ref, atom.f_has_macros, | |
| 1380 | - atom.version)) | |
| 1381 | - containers.append(container) | |
| 1382 | - for err in errs: | |
| 1383 | - log.warning('check_validity(VBAInfoContainer): {}' | |
| 1384 | - .format(err)) | |
| 1385 | - if errs and self.fast_fail: | |
| 1386 | - raise errs[0] | |
| 1387 | - | |
| 1388 | - return containers | |
| 1389 | 1403 | |
| 1390 | - finally: | |
| 1391 | - if stream is not None: | |
| 1392 | - log.debug('closing stream') | |
| 1393 | - stream.close() | |
| 1404 | + # try parse | |
| 1405 | + for idx in self.search_pattern(pattern): | |
| 1406 | + # assume that in stream at idx there is a VBAInfoContainer | |
| 1407 | + stream.seek(idx) | |
| 1408 | + log.debug('extracting at idx {}'.format(idx)) | |
| 1409 | + try: | |
| 1410 | + container = VBAInfoContainer.extract_from(stream) | |
| 1411 | + except Exception: | |
| 1412 | + self._log_exception() | |
| 1413 | + continue | |
| 1394 | 1414 | |
| 1395 | - def search_vba_storage(self): | |
| 1415 | + errs = container.check_validity() | |
| 1416 | + if errs: | |
| 1417 | + log.warning('check_validity found {} issues' | |
| 1418 | + .format(len(errs))) | |
| 1419 | + else: | |
| 1420 | + log.debug('container is ok') | |
| 1421 | + atom = container.vba_info_atom | |
| 1422 | + log.debug('persist id ref is {}, has_macros {}, version {}' | |
| 1423 | + .format(atom.persist_id_ref, atom.f_has_macros, | |
| 1424 | + atom.version)) | |
| 1425 | + yield container | |
| 1426 | + for err in errs: | |
| 1427 | + log.warning('check_validity(VBAInfoContainer): {}' | |
| 1428 | + .format(err)) | |
| 1429 | + if errs and self.fast_fail: | |
| 1430 | + raise errs[0] | |
| 1431 | + | |
| 1432 | + @generator_with_opened_main_stream | |
| 1433 | + def search_vba_storage(self, stream): | |
| 1396 | 1434 | """ search through stream for VBAProjectStg, alternative to parse... |
| 1397 | 1435 | |
| 1398 | 1436 | quick-and-dirty: do not parse everything, just look for right bytes |
| ... | ... | @@ -1403,120 +1441,113 @@ class PptParser(object): |
| 1403 | 1441 | The storages found could also contain (instead of VBA data): ActiveX |
| 1404 | 1442 | data or general OLE data |
| 1405 | 1443 | |
| 1444 | + yields results as it finds them | |
| 1445 | + | |
| 1406 | 1446 | .. seealso:: :py:meth:`search_vba_info` |
| 1407 | 1447 | """ |
| 1408 | 1448 | |
| 1409 | 1449 | logging.debug('looking for VBA storage objects') |
| 1410 | - stream = None | |
| 1411 | - try: | |
| 1412 | - log.debug('opening stream') | |
| 1413 | - stream = self.ole.openstream(MAIN_STREAM_NAME) | |
| 1414 | - | |
| 1415 | - storages = [] | |
| 1416 | - for obj_type in (ExternalObjectStorageUncompressed, | |
| 1417 | - ExternalObjectStorageCompressed): | |
| 1418 | - # re-position stream at start | |
| 1419 | - stream.seek(0, os.SEEK_SET) | |
| 1420 | - | |
| 1421 | - # look for candidate positions | |
| 1422 | - pattern = obj_type.generate_pattern() | |
| 1423 | - candidates = self.search_pattern(pattern, stream) | |
| 1424 | - | |
| 1425 | - # try parse | |
| 1426 | - for idx in candidates: | |
| 1427 | - # assume a ExternalObjectStorage in stream at idx | |
| 1428 | - stream.seek(idx) | |
| 1429 | - log.debug('extracting at idx {}'.format(idx)) | |
| 1430 | - try: | |
| 1431 | - storage = obj_type.extract_from(stream) | |
| 1432 | - except Exception: | |
| 1433 | - self._log_exception() | |
| 1434 | - continue | |
| 1435 | - | |
| 1436 | - errs = storage.check_validity() | |
| 1437 | - if errs: | |
| 1438 | - log.warning('check_validity found {} issues' | |
| 1439 | - .format(len(errs))) | |
| 1440 | - else: | |
| 1441 | - log.debug('storage is ok; compressed={}, size={}, ' | |
| 1442 | - 'size_decomp={}' | |
| 1443 | - .format(storage.is_compressed, | |
| 1444 | - storage.rec_head.rec_len, | |
| 1445 | - storage.uncompressed_size)) | |
| 1446 | - storages.append(storage) | |
| 1447 | - for err in errs: | |
| 1448 | - log.warning('check_validity({}): {}' | |
| 1449 | - .format(obj_type.__name__, err)) | |
| 1450 | - if errs and self.fast_fail: | |
| 1451 | - raise errs[0] | |
| 1452 | - | |
| 1453 | - return storages | |
| 1450 | + for obj_type in (ExternalObjectStorageUncompressed, | |
| 1451 | + ExternalObjectStorageCompressed): | |
| 1452 | + # re-position stream at start | |
| 1453 | + stream.seek(0, os.SEEK_SET) | |
| 1454 | 1454 | |
| 1455 | - finally: | |
| 1456 | - if stream is not None: | |
| 1457 | - log.debug('closing stream') | |
| 1458 | - stream.close() | |
| 1455 | + pattern = obj_type.generate_pattern() | |
| 1459 | 1456 | |
| 1457 | + # try parse | |
| 1458 | + for idx in self.search_pattern(pattern): | |
| 1459 | + # assume a ExternalObjectStorage in stream at idx | |
| 1460 | + stream.seek(idx) | |
| 1461 | + log.debug('extracting at idx {}'.format(idx)) | |
| 1462 | + try: | |
| 1463 | + storage = obj_type.extract_from(stream) | |
| 1464 | + except Exception: | |
| 1465 | + self._log_exception() | |
| 1466 | + continue | |
| 1467 | + | |
| 1468 | + errs = storage.check_validity() | |
| 1469 | + if errs: | |
| 1470 | + log.warning('check_validity found {} issues' | |
| 1471 | + .format(len(errs))) | |
| 1472 | + else: | |
| 1473 | + log.debug('storage is ok; compressed={}, size={}, ' | |
| 1474 | + 'size_decomp={}' | |
| 1475 | + .format(storage.is_compressed, | |
| 1476 | + storage.rec_head.rec_len, | |
| 1477 | + storage.uncompressed_size)) | |
| 1478 | + yield storage | |
| 1479 | + for err in errs: | |
| 1480 | + log.warning('check_validity({}): {}' | |
| 1481 | + .format(obj_type.__name__, err)) | |
| 1482 | + if errs and self.fast_fail: | |
| 1483 | + raise errs[0] | |
| 1460 | 1484 | |
| 1461 | - def decompress_vba_storage(self, storage): | |
| 1485 | + @with_opened_main_stream | |
| 1486 | + def decompress_vba_storage(self, stream, storage): | |
| 1462 | 1487 | """ return decompressed data from search_vba_storage """ |
| 1463 | 1488 | |
| 1464 | 1489 | log.debug('decompressing storage for VBA OLE data stream ') |
| 1465 | - stream = None | |
| 1466 | - try: | |
| 1467 | - log.debug('opening stream') | |
| 1468 | - stream = self.ole.openstream(MAIN_STREAM_NAME) | |
| 1469 | - | |
| 1470 | - # decompress iteratively; a zlib.decompress of all data | |
| 1471 | - # failed with Error -5 (incomplete or truncated stream) | |
| 1472 | - stream.seek(storage.data_offset, os.SEEK_SET) | |
| 1473 | - decomp, n_read, err = \ | |
| 1474 | - iterative_decompress(stream, storage.data_size) | |
| 1475 | - log.debug('decompressed {} to {} bytes; found err: {}' | |
| 1476 | - .format(n_read, len(decomp), err)) | |
| 1477 | - if err and self.fast_fail: | |
| 1478 | - raise err | |
| 1479 | - # otherwise try to continue with partial data | |
| 1480 | - | |
| 1481 | - return decomp | |
| 1482 | - | |
| 1483 | - ## create OleFileIO from decompressed data | |
| 1484 | - #ole = olefile.OleFileIO(decomp) | |
| 1485 | - #root_streams = [entry[0].lower() for entry in ole.listdir()] | |
| 1486 | - #for required in 'project', 'projectwm', 'vba': | |
| 1487 | - # if required not in root_streams: | |
| 1488 | - # raise ValueError('storage seems to not be a VBA storage ' | |
| 1489 | - # '({} not found in root streams)' | |
| 1490 | - # .format(required)) | |
| 1491 | - #log.debug('tests succeeded') | |
| 1492 | - #return ole | |
| 1493 | - | |
| 1494 | - finally: | |
| 1495 | - if stream is not None: | |
| 1496 | - log.debug('closing stream') | |
| 1497 | - stream.close() | |
| 1498 | 1490 | |
| 1499 | - | |
| 1500 | - def read_vba_storage_data(self, storage): | |
| 1491 | + # decompress iteratively; a zlib.decompress of all data | |
| 1492 | + # failed with Error -5 (incomplete or truncated stream) | |
| 1493 | + stream.seek(storage.data_offset, os.SEEK_SET) | |
| 1494 | + decomp, n_read, err = \ | |
| 1495 | + iterative_decompress(stream, storage.data_size) | |
| 1496 | + log.debug('decompressed {} to {} bytes; found err: {}' | |
| 1497 | + .format(n_read, len(decomp), err)) | |
| 1498 | + if err and self.fast_fail: | |
| 1499 | + raise err | |
| 1500 | + # otherwise try to continue with partial data | |
| 1501 | + | |
| 1502 | + return decomp | |
| 1503 | + | |
| 1504 | + ## create OleFileIO from decompressed data | |
| 1505 | + #ole = olefile.OleFileIO(decomp) | |
| 1506 | + #root_streams = [entry[0].lower() for entry in ole.listdir()] | |
| 1507 | + #for required in 'project', 'projectwm', 'vba': | |
| 1508 | + # if required not in root_streams: | |
| 1509 | + # raise ValueError('storage seems to not be a VBA storage ' | |
| 1510 | + # '({} not found in root streams)' | |
| 1511 | + # .format(required)) | |
| 1512 | + #log.debug('tests succeeded') | |
| 1513 | + #return ole | |
| 1514 | + | |
| 1515 | + @with_opened_main_stream | |
| 1516 | + def read_vba_storage_data(self, stream, storage): | |
| 1501 | 1517 | """ return data pointed to by uncompressed storage """ |
| 1502 | 1518 | |
| 1503 | - log.debug('reading uncompressed VBA OLE data stream') | |
| 1504 | - stream = None | |
| 1505 | - try: | |
| 1506 | - log.debug('opening stream') | |
| 1507 | - stream = self.ole.openstream(MAIN_STREAM_NAME) | |
| 1508 | - | |
| 1509 | - log.debug('reading {} bytes starting at {}' | |
| 1510 | - .format(storage.data_size, storage.data_offset)) | |
| 1511 | - stream.seek(storage.data_offset, os.SEEK_SET) | |
| 1512 | - data = stream.read(storage.data_size) | |
| 1513 | - | |
| 1514 | - return data | |
| 1519 | + log.debug('reading uncompressed VBA OLE data stream: ' | |
| 1520 | + '{} bytes starting at {}' | |
| 1521 | + .format(storage.data_size, storage.data_offset)) | |
| 1522 | + stream.seek(storage.data_offset, os.SEEK_SET) | |
| 1523 | + data = stream.read(storage.data_size) | |
| 1524 | + return data | |
| 1525 | + | |
| 1526 | + @generator_with_opened_main_stream | |
| 1527 | + def iter_vba_data(self, stream): | |
| 1528 | + """ search vba infos and storages, yield uncompressed storage data """ | |
| 1529 | + | |
| 1530 | + n_infos = 0 | |
| 1531 | + n_macros = 0 | |
| 1532 | + for info in self.search_vba_info(): | |
| 1533 | + n_infos += 1 | |
| 1534 | + if info.vba_info_atom.f_has_macros > 0: | |
| 1535 | + n_macros += 1 | |
| 1536 | + # TODO: does it make sense at all to continue if n_macros == 0? | |
| 1537 | + # --> no vba-info, so all storages probably ActiveX or other OLE | |
| 1538 | + n_storages = 0 | |
| 1539 | + n_compressed = 0 | |
| 1540 | + for storage in self.search_vba_storage(): | |
| 1541 | + n_storages += 1 | |
| 1542 | + if storage.is_compressed: | |
| 1543 | + n_compressed += 1 | |
| 1544 | + yield self.decompress_vba_storage(storage) | |
| 1545 | + else: | |
| 1546 | + yield self.read_vba_storage_data(storage) | |
| 1515 | 1547 | |
| 1516 | - finally: | |
| 1517 | - if stream is not None: | |
| 1518 | - log.debug('closing stream') | |
| 1519 | - stream.close() | |
| 1548 | + log.info('found {} infos ({} with macros) and {} storages ' | |
| 1549 | + '({} compressed)' | |
| 1550 | + .format(n_infos, n_macros, n_storages, n_compressed)) | |
| 1520 | 1551 | |
| 1521 | 1552 | |
| 1522 | 1553 | def iterative_decompress(stream, size, chunk_size=4096): |
| ... | ... | @@ -1559,16 +1590,9 @@ def test(): |
| 1559 | 1590 | try: |
| 1560 | 1591 | ppt = PptParser(file_name, fast_fail=False) |
| 1561 | 1592 | #ppt.parse_document_persist_object() |
| 1562 | - n_infos = len(ppt.search_vba_info()) | |
| 1563 | - storages = ppt.search_vba_storage() | |
| 1564 | - n_storages = len(storages) | |
| 1565 | - log.debug('found {} infos and {} storages'.format(n_infos, | |
| 1566 | - n_storages)) | |
| 1567 | - if n_infos != n_storages: | |
| 1568 | - log.warning('found different number of vba infos and storages') | |
| 1569 | - for storage in storages: | |
| 1570 | - parser = VBA_Parser(None, ppt.decompress_vba_storage(storage), | |
| 1571 | - container='PptParser') | |
| 1593 | + | |
| 1594 | + for vba_data in ppt.iter_vba_data(): | |
| 1595 | + parser = VBA_Parser(None, vba_data, container='PptParser') | |
| 1572 | 1596 | for vba_root, project_path, dir_path in \ |
| 1573 | 1597 | parser.find_vba_projects(): |
| 1574 | 1598 | log.info('found vba project: root={}, proj={}, dir={}' | ... | ... |