Commit 05a27a43643562e52dbc3710e7e3dd044adf872f
1 parent
8ee20161
managed to extract vba stream from ppt by byte-search for ExternalObjectStorage
Qapla'
Showing
1 changed file
with
300 additions
and
43 deletions
oletools/ppt_parser.py
| ... | ... | @@ -19,7 +19,8 @@ References: |
| 19 | 19 | # - make stream optional in PptUnexpectedData |
| 20 | 20 | # - can speed-up by using less bigger struct.parse calls? |
| 21 | 21 | # - license |
| 22 | -# - create a AtomBase class that defines check_value and parses RecordHead? | |
| 22 | +# - make buffered stream from output of iterative_decompress | |
| 23 | +# - less stream open/close, possibly through decorator for open+closing? | |
| 23 | 24 | # |
| 24 | 25 | # CHANGELOG: |
| 25 | 26 | # 2016-05-04 v0.01 CH: - start parsing "Current User" stream |
| ... | ... | @@ -33,9 +34,11 @@ import logging |
| 33 | 34 | import struct |
| 34 | 35 | import traceback |
| 35 | 36 | import os |
| 37 | +import cStringIO | |
| 36 | 38 | |
| 37 | 39 | import thirdparty.olefile as olefile |
| 38 | 40 | from olevba import get_logger |
| 41 | +import zlib | |
| 39 | 42 | |
| 40 | 43 | |
| 41 | 44 | # a global logger object used for debugging: |
| ... | ... | @@ -132,6 +135,8 @@ class RecordHeader(object): |
| 132 | 135 | |
| 133 | 136 | length of result depends on rec_len being given or not |
| 134 | 137 | """ |
| 138 | + if rec_type is None: | |
| 139 | + raise ValueError('RECORD_TYPE not set!') | |
| 135 | 140 | version_instance = rec_ver + 2**4 * rec_instance |
| 136 | 141 | if rec_len is None: |
| 137 | 142 | return struct.pack('<HH', version_instance, rec_type) |
| ... | ... | @@ -161,7 +166,12 @@ class PptType(object): |
| 161 | 166 | self.rec_head = RecordHeader.extract_from(stream) |
| 162 | 167 | |
| 163 | 168 | def check_validity(self): |
| 164 | - """ to be overwritten in subclasses | |
| 169 | + """ check validity of data | |
| 170 | + | |
| 171 | + replaces 'raise PptUnexpectedData' so caller can get all the errors | |
| 172 | + (not just the first) whenever she wishes. | |
| 173 | + | |
| 174 | + to be overwritten in subclasses | |
| 165 | 175 | |
| 166 | 176 | :returns: list of PptUnexpectedData |
| 167 | 177 | """ |
| ... | ... | @@ -243,6 +253,12 @@ class PptType(object): |
| 243 | 253 | self.rec_head.rec_len, length)) |
| 244 | 254 | return errs |
| 245 | 255 | |
| 256 | + @classmethod | |
| 257 | + def generate_pattern(clz, rec_len=None): | |
| 258 | + """ call RecordHeader.generate with values for this type """ | |
| 259 | + return RecordHeader.generate(clz.RECORD_TYPE, rec_len, | |
| 260 | + clz.RECORD_INSTANCE, clz.RECORD_VERSION) | |
| 261 | + | |
| 246 | 262 | |
| 247 | 263 | class CurrentUserAtom(PptType): |
| 248 | 264 | """ An atom record that specifies information about the last user to modify |
| ... | ... | @@ -853,7 +869,7 @@ class VBAInfoContainer(PptType): |
| 853 | 869 | if rec_head is None: |
| 854 | 870 | obj.read_rec_head(stream) |
| 855 | 871 | else: |
| 856 | - log.debug('skip parsing of RecordHead') | |
| 872 | + log.debug('skip parsing of RecordHeader') | |
| 857 | 873 | obj.rec_head = rec_head |
| 858 | 874 | obj.vba_info_atom = VBAInfoAtom.extract_from(stream) |
| 859 | 875 | return obj |
| ... | ... | @@ -912,6 +928,92 @@ class VBAInfoAtom(PptType): |
| 912 | 928 | errs.extend(self.check_value('version', self.version, 2)) |
| 913 | 929 | return errs |
| 914 | 930 | |
| 931 | + | |
| 932 | +class ExternalObjectStorage(PptType): | |
| 933 | + """ storage for compressed/uncompressed OLE/VBA/ActiveX control data | |
| 934 | + | |
| 935 | + Matches types ExOleObjStgCompressedAtom, ExOleObjStgUncompressedAtom, | |
| 936 | + VbaProjectStgCompressedAtom, VbaProjectStgUncompressedAtom, | |
| 937 | + ExControlStgUncompressedAtom, ExControlStgCompressedAtom | |
| 938 | + | |
| 939 | + Difference between compressed and uncompressed: RecordHeader.rec_instance | |
| 940 | + is 0 or 1, first variable after RecordHeader is decompressed_size | |
| 941 | + | |
| 942 | + Data is not read at first, only its offset in the stream and size is saved | |
| 943 | + | |
| 944 | + e.g. | |
| 945 | + https://msdn.microsoft.com/en-us/library/dd952169%28v=office.12%29.aspx | |
| 946 | + """ | |
| 947 | + | |
| 948 | + RECORD_TYPE = 0x1011 | |
| 949 | + RECORD_INSTANCE_COMPRESSED = 1 | |
| 950 | + RECORD_INSTANCE_UNCOMPRESSED = 0 | |
| 951 | + | |
| 952 | + def __init__(self, compressed=None): | |
| 953 | + super(ExternalObjectStorage, self).__init__() | |
| 954 | + if compressed is None: | |
| 955 | + self.RECORD_INSTANCE = None # otherwise defaults to 0 | |
| 956 | + elif compressed: | |
| 957 | + self.RECORD_INSTANCE = self.RECORD_INSTANCE_COMPRESSED | |
| 958 | + self.compressed = True | |
| 959 | + else: | |
| 960 | + self.RECORD_INSTANCE = self.RECORD_INSTANCE_UNCOMPRESSED | |
| 961 | + self.compressed = False | |
| 962 | + self.uncompressed_size = None | |
| 963 | + self.data_offset = None | |
| 964 | + self.data_size = None | |
| 965 | + | |
| 966 | + def extract_from(self, stream): | |
| 967 | + """ not a classmethod because of the compressed attribute | |
| 968 | + | |
| 969 | + see also: DummyType | |
| 970 | + """ | |
| 971 | + log.debug('Parsing ExternalObjectStorage (compressed={}) from stream' | |
| 972 | + .format(self.compressed)) | |
| 973 | + self.read_rec_head(stream) | |
| 974 | + self.data_size = self.rec_head.rec_len | |
| 975 | + if self.compressed: | |
| 976 | + log.debug('is compressed --> reduce size') | |
| 977 | + self.uncompressed_size = read_4(stream) | |
| 978 | + self.data_size -= 4 | |
| 979 | + self.data_offset = stream.tell() | |
| 980 | + | |
| 981 | + def check_validity(self): | |
| 982 | + return self.check_rec_head() | |
| 983 | + | |
| 984 | + | |
| 985 | +class ExternalObjectStorageUncompressed(ExternalObjectStorage): | |
| 986 | + """ subclass of ExternalObjectStorage for uncompressed objects """ | |
| 987 | + RECORD_INSTANCE = ExternalObjectStorage.RECORD_INSTANCE_UNCOMPRESSED | |
| 988 | + | |
| 989 | + def __init__(self): | |
| 990 | + super(ExternalObjectStorageUncompressed, self).__init__(False) | |
| 991 | + | |
| 992 | + @classmethod | |
| 993 | + def extract_from(clz, stream): | |
| 994 | + """ note the usage of super here: call instance method of super class! | |
| 995 | + """ | |
| 996 | + obj = clz() | |
| 997 | + super(ExternalObjectStorageUncompressed, obj).extract_from(stream) | |
| 998 | + return obj | |
| 999 | + | |
| 1000 | + | |
| 1001 | +class ExternalObjectStorageCompressed(ExternalObjectStorage): | |
| 1002 | + """ subclass of ExternalObjectStorage for compressed objects """ | |
| 1003 | + RECORD_INSTANCE = ExternalObjectStorage.RECORD_INSTANCE_COMPRESSED | |
| 1004 | + | |
| 1005 | + def __init__(self): | |
| 1006 | + super(ExternalObjectStorageCompressed, self).__init__(True) | |
| 1007 | + | |
| 1008 | + @classmethod | |
| 1009 | + def extract_from(clz, stream): | |
| 1010 | + """ note the usage of super here: call instance method of super class! | |
| 1011 | + """ | |
| 1012 | + obj = clz() | |
| 1013 | + super(ExternalObjectStorageCompressed, obj).extract_from(stream) | |
| 1014 | + return obj | |
| 1015 | + | |
| 1016 | + | |
| 915 | 1017 | # === PptParser =============================================================== |
| 916 | 1018 | |
| 917 | 1019 | |
| ... | ... | @@ -1180,59 +1282,65 @@ class PptParser(object): |
| 1180 | 1282 | if errs and self.fast_fail: |
| 1181 | 1283 | raise errs[0] |
| 1182 | 1284 | |
| 1183 | - def search_vba(self): | |
| 1184 | - """ quick-and-dirty: do not parse everything, just look for right bytes | |
| 1185 | - | |
| 1186 | - "quick" here means quick to program. Runtime now is linear is document | |
| 1187 | - size (--> for big documents the other method might be faster) | |
| 1188 | - """ | |
| 1285 | + def search_pattern(self, pattern, stream): | |
| 1286 | + """ search for pattern in stream, return indices """ | |
| 1189 | 1287 | |
| 1190 | 1288 | BUF_SIZE = 1024 |
| 1191 | 1289 | |
| 1192 | - pattern = RecordHeader.generate( | |
| 1193 | - VBAInfoContainer.RECORD_TYPE, | |
| 1194 | - rec_len=VBAInfoContainer.RECORD_LENGTH, | |
| 1195 | - rec_instance=VBAInfoContainer.RECORD_INSTANCE, | |
| 1196 | - rec_ver=VBAInfoContainer.RECORD_VERSION) \ | |
| 1197 | - + RecordHeader.generate( | |
| 1198 | - VBAInfoAtom.RECORD_TYPE, | |
| 1199 | - rec_len=VBAInfoAtom.RECORD_LENGTH, | |
| 1200 | - rec_instance=VBAInfoAtom.RECORD_INSTANCE, | |
| 1201 | - rec_ver=VBAInfoAtom.RECORD_VERSION) | |
| 1202 | 1290 | pattern_len = len(pattern) |
| 1203 | 1291 | log.debug('pattern length is {}'.format(pattern_len)) |
| 1204 | 1292 | if pattern_len > BUF_SIZE: |
| 1205 | 1293 | raise ValueError('need buf > pattern to search!') |
| 1206 | 1294 | |
| 1295 | + n_reads = 0 | |
| 1296 | + candidates = [] | |
| 1297 | + while True: | |
| 1298 | + start_pos = stream.tell() | |
| 1299 | + n_reads += 1 | |
| 1300 | + #log.debug('read {} starting from {}' | |
| 1301 | + # .format(BUF_SIZE, start_pos)) | |
| 1302 | + buf = stream.read(BUF_SIZE) | |
| 1303 | + idx = buf.find(pattern) | |
| 1304 | + while idx != -1: | |
| 1305 | + log.info('found pattern at index {}'.format(start_pos+idx)) | |
| 1306 | + candidates.append(start_pos+idx) | |
| 1307 | + idx = buf.find(pattern, idx+1) | |
| 1308 | + | |
| 1309 | + if len(buf) == BUF_SIZE: | |
| 1310 | + # move back a bit to avoid splitting of pattern through buf | |
| 1311 | + stream.seek(-1 * pattern_len, os.SEEK_CUR) | |
| 1312 | + else: | |
| 1313 | + log.debug('reached end of buf (read {}<{}) after {} reads' | |
| 1314 | + .format(len(buf), BUF_SIZE, n_reads)) | |
| 1315 | + break | |
| 1316 | + return candidates | |
| 1317 | + | |
| 1318 | + | |
| 1319 | + def search_vba_info(self): | |
| 1320 | + """ search through stream for VBAInfoContainer, alternative to parse... | |
| 1321 | + | |
| 1322 | + quick-and-dirty: do not parse everything, just look for right bytes | |
| 1323 | + | |
| 1324 | + "quick" here means quick to program. Runtime now is linear in document | |
| 1325 | + size (--> for big documents the other method might be faster) | |
| 1326 | + | |
| 1327 | + .. seealso:: search_vba_storage | |
| 1328 | + """ | |
| 1329 | + | |
| 1330 | + pattern = VBAInfoContainer.generate_pattern( | |
| 1331 | + rec_len=VBAInfoContainer.RECORD_LENGTH) \ | |
| 1332 | + + VBAInfoAtom.generate_pattern( | |
| 1333 | + rec_len=VBAInfoAtom.RECORD_LENGTH) | |
| 1207 | 1334 | stream = None |
| 1208 | 1335 | try: |
| 1209 | 1336 | log.debug('opening stream') |
| 1210 | 1337 | stream = self.ole.openstream(MAIN_STREAM_NAME) |
| 1211 | 1338 | |
| 1212 | 1339 | # look for candidate positions |
| 1213 | - n_reads = 0 | |
| 1214 | - candidates = [] | |
| 1215 | - while True: | |
| 1216 | - start_pos = stream.tell() | |
| 1217 | - n_reads += 1 | |
| 1218 | - #log.debug('read {} starting from {}' | |
| 1219 | - # .format(BUF_SIZE, start_pos)) | |
| 1220 | - buf = stream.read(BUF_SIZE) | |
| 1221 | - idx = buf.find(pattern) | |
| 1222 | - while idx != -1: | |
| 1223 | - log.info('found pattern at index {}'.format(start_pos+idx)) | |
| 1224 | - candidates.append(start_pos+idx) | |
| 1225 | - idx = buf.find(pattern, idx+1) | |
| 1226 | - | |
| 1227 | - if len(buf) == BUF_SIZE: | |
| 1228 | - # move back a bit to avoid splitting of pattern through buf | |
| 1229 | - stream.seek(-1 * pattern_len, os.SEEK_CUR) | |
| 1230 | - else: | |
| 1231 | - log.debug('reached end of buf (read {}<{}) after {} reads' | |
| 1232 | - .format(len(buf), BUF_SIZE, n_reads)) | |
| 1233 | - break | |
| 1340 | + candidates = self.search_pattern(pattern, stream) | |
| 1234 | 1341 | |
| 1235 | 1342 | # try parse |
| 1343 | + containers = [] | |
| 1236 | 1344 | for idx in candidates: |
| 1237 | 1345 | # assume that in stream at idx there is a VBAInfoContainer |
| 1238 | 1346 | stream.seek(idx) |
| ... | ... | @@ -1252,23 +1360,149 @@ class PptParser(object): |
| 1252 | 1360 | log.info('persist id ref is {}, has_macros {}, version {}' |
| 1253 | 1361 | .format(atom.persist_id_ref, atom.f_has_macros, |
| 1254 | 1362 | atom.version)) |
| 1363 | + containers.append(container) | |
| 1255 | 1364 | for err in errs: |
| 1256 | 1365 | log.warning('check_validity(VBAInfoContainer): {}' |
| 1257 | 1366 | .format(err)) |
| 1258 | 1367 | if errs and self.fast_fail: |
| 1259 | 1368 | raise errs[0] |
| 1260 | 1369 | |
| 1370 | + return containers | |
| 1371 | + | |
| 1261 | 1372 | finally: |
| 1262 | 1373 | if stream is not None: |
| 1263 | 1374 | log.debug('closing stream') |
| 1264 | 1375 | stream.close() |
| 1265 | 1376 | |
| 1377 | + def search_vba_storage(self): | |
| 1378 | + """ search through stream for VBAProjectStg, alternative to parse... | |
| 1379 | + | |
| 1380 | + quick-and-dirty: do not parse everything, just look for right bytes | |
| 1381 | + | |
| 1382 | + "quick" here means quick to program. Runtime now is linear in document | |
| 1383 | + size (--> for big documents the other method might be faster) | |
| 1384 | + | |
| 1385 | + The storages found could also contain (instead of VBA data): ActiveX | |
| 1386 | + data or general OLE data | |
| 1387 | + | |
| 1388 | + .. seealso:: :py:meth:`search_vba_info` | |
| 1389 | + """ | |
| 1390 | + | |
| 1391 | + stream = None | |
| 1392 | + try: | |
| 1393 | + log.debug('opening stream') | |
| 1394 | + stream = self.ole.openstream(MAIN_STREAM_NAME) | |
| 1395 | + | |
| 1396 | + storages = [] | |
| 1397 | + for obj_type in (ExternalObjectStorageUncompressed, | |
| 1398 | + ExternalObjectStorageCompressed): | |
| 1399 | + # re-position stream at start | |
| 1400 | + stream.seek(0, os.SEEK_SET) | |
| 1401 | + | |
| 1402 | + # look for candidate positions | |
| 1403 | + pattern = obj_type.generate_pattern() | |
| 1404 | + candidates = self.search_pattern(pattern, stream) | |
| 1405 | + | |
| 1406 | + # try parse | |
| 1407 | + for idx in candidates: | |
| 1408 | + # assume a ExternalObjectStorage in stream at idx | |
| 1409 | + stream.seek(idx) | |
| 1410 | + log.info('extracting at idx {}'.format(idx)) | |
| 1411 | + try: | |
| 1412 | + storage = obj_type.extract_from(stream) | |
| 1413 | + except Exception: | |
| 1414 | + self._log_exception() | |
| 1415 | + continue | |
| 1416 | + | |
| 1417 | + errs = storage.check_validity() | |
| 1418 | + if errs: | |
| 1419 | + log.warning('check_validity found {} issues' | |
| 1420 | + .format(len(errs))) | |
| 1421 | + else: | |
| 1422 | + log.info('storage is ok; compressed={}, size={}, ' | |
| 1423 | + 'size_decomp={}' | |
| 1424 | + .format(storage.compressed, | |
| 1425 | + storage.rec_head.rec_len, | |
| 1426 | + storage.uncompressed_size)) | |
| 1427 | + storages.append(storage) | |
| 1428 | + for err in errs: | |
| 1429 | + log.warning('check_validity({}): {}' | |
| 1430 | + .format(obj_type.__name__, err)) | |
| 1431 | + if errs and self.fast_fail: | |
| 1432 | + raise errs[0] | |
| 1433 | + | |
| 1434 | + return storages | |
| 1435 | + | |
| 1436 | + finally: | |
| 1437 | + if stream is not None: | |
| 1438 | + log.debug('closing stream') | |
| 1439 | + stream.close() | |
| 1440 | + | |
| 1441 | + | |
| 1442 | + def decompress_vba_storage(self, storage): | |
| 1443 | + """ return decompressed data from search_vba_storage """ | |
| 1444 | + | |
| 1445 | + log.debug('decompressing storage for VBA OLE data stream ') | |
| 1446 | + stream = None | |
| 1447 | + try: | |
| 1448 | + log.debug('opening stream') | |
| 1449 | + stream = self.ole.openstream(MAIN_STREAM_NAME) | |
| 1450 | + | |
| 1451 | + # decompress iteratively; a zlib.decompress of all data | |
| 1452 | + # failed with Error -5 (incomplete or truncated stream) | |
| 1453 | + stream.seek(storage.data_offset, os.SEEK_SET) | |
| 1454 | + decomp, n_read, err = \ | |
| 1455 | + iterative_decompress(stream, storage.data_size) | |
| 1456 | + log.info('decompressed {} to {} bytes, err is {}' | |
| 1457 | + .format(n_read, len(decomp), err)) | |
| 1458 | + if err and self.fast_fail: | |
| 1459 | + raise err | |
| 1460 | + # otherwise try to continue with partial data | |
| 1461 | + | |
| 1462 | + return decomp | |
| 1463 | + | |
| 1464 | + ## create OleFileIO from decompressed data | |
| 1465 | + #ole = olefile.OleFileIO(decomp) | |
| 1466 | + #root_streams = [entry[0].lower() for entry in ole.listdir()] | |
| 1467 | + #for required in 'project', 'projectwm', 'vba': | |
| 1468 | + # if required not in root_streams: | |
| 1469 | + # raise ValueError('storage seems to not be a VBA storage ' | |
| 1470 | + # '({} not found in root streams)' | |
| 1471 | + # .format(required)) | |
| 1472 | + #log.debug('tests succeeded') | |
| 1473 | + #return ole | |
| 1474 | + | |
| 1475 | + finally: | |
| 1476 | + if stream is not None: | |
| 1477 | + log.debug('closing stream') | |
| 1478 | + stream.close() | |
| 1479 | + | |
| 1480 | + | |
| 1481 | +def iterative_decompress(stream, size, chunk_size=4096): | |
| 1482 | + """ decompress data from stream chunk-wise """ | |
| 1483 | + | |
| 1484 | + decompressor = zlib.decompressobj() | |
| 1485 | + n_read = 0 | |
| 1486 | + decomp = '' | |
| 1487 | + return_err = None | |
| 1488 | + | |
| 1489 | + try: | |
| 1490 | + while n_read < size: | |
| 1491 | + n_new = min(size-n_read, chunk_size) | |
| 1492 | + decomp += decompressor.decompress(stream.read(n_new)) | |
| 1493 | + n_read += n_new | |
| 1494 | + except zlib.error as err: | |
| 1495 | + return_err = err | |
| 1496 | + | |
| 1497 | + return decomp, n_read, return_err | |
| 1498 | + | |
| 1266 | 1499 | # === TESTING ================================================================= |
| 1267 | 1500 | |
| 1268 | 1501 | def test(): |
| 1269 | 1502 | """ for testing and debugging """ |
| 1270 | 1503 | |
| 1271 | 1504 | from glob import glob |
| 1505 | + from olevba import VBA_Parser | |
| 1272 | 1506 | |
| 1273 | 1507 | # setup logging |
| 1274 | 1508 | logging.basicConfig(level=logging.DEBUG, |
| ... | ... | @@ -1280,9 +1514,32 @@ def test(): |
| 1280 | 1514 | # parse |
| 1281 | 1515 | log.info('-' * 72) |
| 1282 | 1516 | log.info('test file: {}'.format(file_name)) |
| 1283 | - ppt = PptParser(file_name, fast_fail=False) | |
| 1284 | - #ppt.parse_document_persist_object() | |
| 1285 | - ppt.search_vba() | |
| 1517 | + try: | |
| 1518 | + ppt = PptParser(file_name, fast_fail=False) | |
| 1519 | + #ppt.parse_document_persist_object() | |
| 1520 | + n_infos = len(ppt.search_vba_info()) | |
| 1521 | + storages = ppt.search_vba_storage() | |
| 1522 | + n_storages = len(storages) | |
| 1523 | + log.debug('found {} infos and {} storages'.format(n_infos, | |
| 1524 | + n_storages)) | |
| 1525 | + if n_infos != n_storages: | |
| 1526 | + log.warning('found different number of vba infos and storages') | |
| 1527 | + for storage in storages: | |
| 1528 | + parser = VBA_Parser(None, ppt.decompress_vba_storage(storage), | |
| 1529 | + container=file_name) | |
| 1530 | + for vba_root, project_path, dir_path in parser.find_vba_projects(): | |
| 1531 | + log.info('found vba project: root={}, proj={}, dir={}' | |
| 1532 | + .format(vba_root, project_path, dir_path)) | |
| 1533 | + for subfilename, stream_path, vba_filename, vba_code in \ | |
| 1534 | + parser.extract_all_macros(): | |
| 1535 | + log.info('found macro: subfile={}, stream={}, vbafile={}' | |
| 1536 | + .format(subfilename, stream_path, vba_filename)) | |
| 1537 | + for line in vba_code.splitlines(): | |
| 1538 | + log.info('code: {}'.format(line.rstrip())) | |
| 1539 | + | |
| 1540 | + | |
| 1541 | + except Exception: | |
| 1542 | + log.exception('exception') | |
| 1286 | 1543 | |
| 1287 | 1544 | |
| 1288 | 1545 | if __name__ == '__main__': | ... | ... |