Commit 05a27a43643562e52dbc3710e7e3dd044adf872f

Authored by Christian Herdtweck
1 parent 8ee20161

managed to extract vba stream from ppt by byte-search for ExternalObjectStorage

Qapla'
Showing 1 changed file with 300 additions and 43 deletions
oletools/ppt_parser.py
@@ -19,7 +19,8 @@ References: @@ -19,7 +19,8 @@ References:
19 # - make stream optional in PptUnexpectedData 19 # - make stream optional in PptUnexpectedData
20 # - can speed up by using fewer but bigger struct.parse calls? 20 # - can speed up by using fewer but bigger struct.parse calls?
21 # - license 21 # - license
22 -# - create a AtomBase class that defines check_value and parses RecordHead? 22 +# - make buffered stream from output of iterative_decompress
  23 +# - fewer stream open/close calls, possibly via a decorator for open+close?
23 # 24 #
24 # CHANGELOG: 25 # CHANGELOG:
25 # 2016-05-04 v0.01 CH: - start parsing "Current User" stream 26 # 2016-05-04 v0.01 CH: - start parsing "Current User" stream
@@ -33,9 +34,11 @@ import logging @@ -33,9 +34,11 @@ import logging
33 import struct 34 import struct
34 import traceback 35 import traceback
35 import os 36 import os
  37 +import cStringIO
36 38
37 import thirdparty.olefile as olefile 39 import thirdparty.olefile as olefile
38 from olevba import get_logger 40 from olevba import get_logger
  41 +import zlib
39 42
40 43
41 # a global logger object used for debugging: 44 # a global logger object used for debugging:
@@ -132,6 +135,8 @@ class RecordHeader(object): @@ -132,6 +135,8 @@ class RecordHeader(object):
132 135
133 length of result depends on rec_len being given or not 136 length of result depends on rec_len being given or not
134 """ 137 """
  138 + if rec_type is None:
  139 + raise ValueError('RECORD_TYPE not set!')
135 version_instance = rec_ver + 2**4 * rec_instance 140 version_instance = rec_ver + 2**4 * rec_instance
136 if rec_len is None: 141 if rec_len is None:
137 return struct.pack('<HH', version_instance, rec_type) 142 return struct.pack('<HH', version_instance, rec_type)
@@ -161,7 +166,12 @@ class PptType(object): @@ -161,7 +166,12 @@ class PptType(object):
161 self.rec_head = RecordHeader.extract_from(stream) 166 self.rec_head = RecordHeader.extract_from(stream)
162 167
163 def check_validity(self): 168 def check_validity(self):
164 - """ to be overwritten in subclasses 169 + """ check validity of data
  170 +
  171 + replaces 'raise PptUnexpectedData' so caller can get all the errors
  172 + (not just the first) whenever she wishes.
  173 +
  174 + to be overwritten in subclasses
165 175
166 :returns: list of PptUnexpectedData 176 :returns: list of PptUnexpectedData
167 """ 177 """
@@ -243,6 +253,12 @@ class PptType(object): @@ -243,6 +253,12 @@ class PptType(object):
243 self.rec_head.rec_len, length)) 253 self.rec_head.rec_len, length))
244 return errs 254 return errs
245 255
  256 + @classmethod
  257 + def generate_pattern(clz, rec_len=None):
  258 + """ call RecordHeader.generate with values for this type """
  259 + return RecordHeader.generate(clz.RECORD_TYPE, rec_len,
  260 + clz.RECORD_INSTANCE, clz.RECORD_VERSION)
  261 +
246 262
247 class CurrentUserAtom(PptType): 263 class CurrentUserAtom(PptType):
248 """ An atom record that specifies information about the last user to modify 264 """ An atom record that specifies information about the last user to modify
@@ -853,7 +869,7 @@ class VBAInfoContainer(PptType): @@ -853,7 +869,7 @@ class VBAInfoContainer(PptType):
853 if rec_head is None: 869 if rec_head is None:
854 obj.read_rec_head(stream) 870 obj.read_rec_head(stream)
855 else: 871 else:
856 - log.debug('skip parsing of RecordHead') 872 + log.debug('skip parsing of RecordHeader')
857 obj.rec_head = rec_head 873 obj.rec_head = rec_head
858 obj.vba_info_atom = VBAInfoAtom.extract_from(stream) 874 obj.vba_info_atom = VBAInfoAtom.extract_from(stream)
859 return obj 875 return obj
@@ -912,6 +928,92 @@ class VBAInfoAtom(PptType): @@ -912,6 +928,92 @@ class VBAInfoAtom(PptType):
912 errs.extend(self.check_value('version', self.version, 2)) 928 errs.extend(self.check_value('version', self.version, 2))
913 return errs 929 return errs
914 930
  931 +
  932 +class ExternalObjectStorage(PptType):
  933 + """ storage for compressed/uncompressed OLE/VBA/ActiveX control data
  934 +
  935 + Matches types ExOleObjStgCompressedAtom, ExOleObjStgUncompressedAtom,
  936 + VbaProjectStgCompressedAtom, VbaProjectStgUncompressedAtom,
  937 + ExControlStgUncompressedAtom, ExControlStgCompressedAtom
  938 +
  939 + Compressed vs. uncompressed: RecordHeader.rec_instance is 1 vs. 0; only if
  940 + compressed, the first variable after RecordHeader is the decompressed size
  941 +
  942 + Data is not read at first, only its offset in the stream and size is saved
  943 +
  944 + e.g.
  945 + https://msdn.microsoft.com/en-us/library/dd952169%28v=office.12%29.aspx
  946 + """
  947 +
  948 + RECORD_TYPE = 0x1011
  949 + RECORD_INSTANCE_COMPRESSED = 1
  950 + RECORD_INSTANCE_UNCOMPRESSED = 0
  951 +
  952 + def __init__(self, compressed=None):
  953 + super(ExternalObjectStorage, self).__init__()
  954 + if compressed is None:
  955 + self.RECORD_INSTANCE = None # otherwise defaults to 0
  956 + elif compressed:
  957 + self.RECORD_INSTANCE = self.RECORD_INSTANCE_COMPRESSED
  958 + self.compressed = True
  959 + else:
  960 + self.RECORD_INSTANCE = self.RECORD_INSTANCE_UNCOMPRESSED
  961 + self.compressed = False
  962 + self.uncompressed_size = None
  963 + self.data_offset = None
  964 + self.data_size = None
  965 +
  966 + def extract_from(self, stream):
  967 + """ not a classmethod because of compressed attrib
  968 +
  969 + see also: DummyType
  970 + """
  971 + log.debug('Parsing ExternalObjectStorage (compressed={}) from stream'
  972 + .format(self.compressed))
  973 + self.read_rec_head(stream)
  974 + self.data_size = self.rec_head.rec_len
  975 + if self.compressed:
  976 + log.debug('is compressed --> reduce size')
  977 + self.uncompressed_size = read_4(stream)
  978 + self.data_size -= 4
  979 + self.data_offset = stream.tell()
  980 +
  981 + def check_validity(self):
  982 + return self.check_rec_head()
  983 +
  984 +
  985 +class ExternalObjectStorageUncompressed(ExternalObjectStorage):
  986 + """ subclass of ExternalObjectStorage for uncompressed objects """
  987 + RECORD_INSTANCE = ExternalObjectStorage.RECORD_INSTANCE_UNCOMPRESSED
  988 +
  989 + def __init__(self):
  990 + super(ExternalObjectStorageUncompressed, self).__init__(False)
  991 +
  992 + @classmethod
  993 + def extract_from(clz, stream):
  994 + """ note the usage of super here: call instance method of super class!
  995 + """
  996 + obj = clz()
  997 + super(ExternalObjectStorageUncompressed, obj).extract_from(stream)
  998 + return obj
  999 +
  1000 +
  1001 +class ExternalObjectStorageCompressed(ExternalObjectStorage):
  1002 + """ subclass of ExternalObjectStorage for compressed objects """
  1003 + RECORD_INSTANCE = ExternalObjectStorage.RECORD_INSTANCE_COMPRESSED
  1004 +
  1005 + def __init__(self):
  1006 + super(ExternalObjectStorageCompressed, self).__init__(True)
  1007 +
  1008 + @classmethod
  1009 + def extract_from(clz, stream):
  1010 + """ note the usage of super here: call instance method of super class!
  1011 + """
  1012 + obj = clz()
  1013 + super(ExternalObjectStorageCompressed, obj).extract_from(stream)
  1014 + return obj
  1015 +
  1016 +
915 # === PptParser =============================================================== 1017 # === PptParser ===============================================================
916 1018
917 1019
@@ -1180,59 +1282,65 @@ class PptParser(object): @@ -1180,59 +1282,65 @@ class PptParser(object):
1180 if errs and self.fast_fail: 1282 if errs and self.fast_fail:
1181 raise errs[0] 1283 raise errs[0]
1182 1284
1183 - def search_vba(self):  
1184 - """ quick-and-dirty: do not parse everything, just look for right bytes  
1185 -  
1186 - "quick" here means quick to program. Runtime now is linear is document  
1187 - size (--> for big documents the other method might be faster)  
1188 - """ 1285 + def search_pattern(self, pattern, stream):
  1286 + """ search for pattern in stream, return indices """
1189 1287
1190 BUF_SIZE = 1024 1288 BUF_SIZE = 1024
1191 1289
1192 - pattern = RecordHeader.generate(  
1193 - VBAInfoContainer.RECORD_TYPE,  
1194 - rec_len=VBAInfoContainer.RECORD_LENGTH,  
1195 - rec_instance=VBAInfoContainer.RECORD_INSTANCE,  
1196 - rec_ver=VBAInfoContainer.RECORD_VERSION) \  
1197 - + RecordHeader.generate(  
1198 - VBAInfoAtom.RECORD_TYPE,  
1199 - rec_len=VBAInfoAtom.RECORD_LENGTH,  
1200 - rec_instance=VBAInfoAtom.RECORD_INSTANCE,  
1201 - rec_ver=VBAInfoAtom.RECORD_VERSION)  
1202 pattern_len = len(pattern) 1290 pattern_len = len(pattern)
1203 log.debug('pattern length is {}'.format(pattern_len)) 1291 log.debug('pattern length is {}'.format(pattern_len))
1204 if pattern_len > BUF_SIZE: 1292 if pattern_len > BUF_SIZE:
1205 raise ValueError('need buf > pattern to search!') 1293 raise ValueError('need buf > pattern to search!')
1206 1294
  1295 + n_reads = 0
  1296 + candidates = []
  1297 + while True:
  1298 + start_pos = stream.tell()
  1299 + n_reads += 1
  1300 + #log.debug('read {} starting from {}'
  1301 + # .format(BUF_SIZE, start_pos))
  1302 + buf = stream.read(BUF_SIZE)
  1303 + idx = buf.find(pattern)
  1304 + while idx != -1:
  1305 + log.info('found pattern at index {}'.format(start_pos+idx))
  1306 + candidates.append(start_pos+idx)
  1307 + idx = buf.find(pattern, idx+1)
  1308 +
  1309 + if len(buf) == BUF_SIZE:
  1310 + # move back a bit to avoid splitting of pattern through buf
  1311 + stream.seek(-1 * pattern_len, os.SEEK_CUR)
  1312 + else:
  1313 + log.debug('reached end of buf (read {}<{}) after {} reads'
  1314 + .format(len(buf), BUF_SIZE, n_reads))
  1315 + break
  1316 + return candidates
  1317 +
  1318 +
  1319 + def search_vba_info(self):
  1320 + """ search through stream for VBAInfoContainer, alternative to parse...
  1321 +
  1322 + quick-and-dirty: do not parse everything, just look for right bytes
  1323 +
  1324 + "quick" here means quick to program. Runtime now is linear in document
  1325 + size (--> for big documents the other method might be faster)
  1326 +
  1327 + .. seealso:: :py:meth:`search_vba_storage`
  1328 + """
  1329 +
  1330 + pattern = VBAInfoContainer.generate_pattern(
  1331 + rec_len=VBAInfoContainer.RECORD_LENGTH) \
  1332 + + VBAInfoAtom.generate_pattern(
  1333 + rec_len=VBAInfoAtom.RECORD_LENGTH)
1207 stream = None 1334 stream = None
1208 try: 1335 try:
1209 log.debug('opening stream') 1336 log.debug('opening stream')
1210 stream = self.ole.openstream(MAIN_STREAM_NAME) 1337 stream = self.ole.openstream(MAIN_STREAM_NAME)
1211 1338
1212 # look for candidate positions 1339 # look for candidate positions
1213 - n_reads = 0  
1214 - candidates = []  
1215 - while True:  
1216 - start_pos = stream.tell()  
1217 - n_reads += 1  
1218 - #log.debug('read {} starting from {}'  
1219 - # .format(BUF_SIZE, start_pos))  
1220 - buf = stream.read(BUF_SIZE)  
1221 - idx = buf.find(pattern)  
1222 - while idx != -1:  
1223 - log.info('found pattern at index {}'.format(start_pos+idx))  
1224 - candidates.append(start_pos+idx)  
1225 - idx = buf.find(pattern, idx+1)  
1226 -  
1227 - if len(buf) == BUF_SIZE:  
1228 - # move back a bit to avoid splitting of pattern through buf  
1229 - stream.seek(-1 * pattern_len, os.SEEK_CUR)  
1230 - else:  
1231 - log.debug('reached end of buf (read {}<{}) after {} reads'  
1232 - .format(len(buf), BUF_SIZE, n_reads))  
1233 - break 1340 + candidates = self.search_pattern(pattern, stream)
1234 1341
1235 # try parse 1342 # try parse
  1343 + containers = []
1236 for idx in candidates: 1344 for idx in candidates:
1237 # assume that in stream at idx there is a VBAInfoContainer 1345 # assume that in stream at idx there is a VBAInfoContainer
1238 stream.seek(idx) 1346 stream.seek(idx)
@@ -1252,23 +1360,149 @@ class PptParser(object): @@ -1252,23 +1360,149 @@ class PptParser(object):
1252 log.info('persist id ref is {}, has_macros {}, version {}' 1360 log.info('persist id ref is {}, has_macros {}, version {}'
1253 .format(atom.persist_id_ref, atom.f_has_macros, 1361 .format(atom.persist_id_ref, atom.f_has_macros,
1254 atom.version)) 1362 atom.version))
  1363 + containers.append(container)
1255 for err in errs: 1364 for err in errs:
1256 log.warning('check_validity(VBAInfoContainer): {}' 1365 log.warning('check_validity(VBAInfoContainer): {}'
1257 .format(err)) 1366 .format(err))
1258 if errs and self.fast_fail: 1367 if errs and self.fast_fail:
1259 raise errs[0] 1368 raise errs[0]
1260 1369
  1370 + return containers
  1371 +
1261 finally: 1372 finally:
1262 if stream is not None: 1373 if stream is not None:
1263 log.debug('closing stream') 1374 log.debug('closing stream')
1264 stream.close() 1375 stream.close()
1265 1376
  1377 + def search_vba_storage(self):
  1378 + """ search through stream for VBAProjectStg, alternative to parse...
  1379 +
  1380 + quick-and-dirty: do not parse everything, just look for right bytes
  1381 +
  1382 + "quick" here means quick to program. Runtime now is linear in document
  1383 + size (--> for big documents the other method might be faster)
  1384 +
  1385 + The storages found could also contain (instead of VBA data): ActiveX
  1386 + data or general OLE data
  1387 +
  1388 + .. seealso:: :py:meth:`search_vba_info`
  1389 + """
  1390 +
  1391 + stream = None
  1392 + try:
  1393 + log.debug('opening stream')
  1394 + stream = self.ole.openstream(MAIN_STREAM_NAME)
  1395 +
  1396 + storages = []
  1397 + for obj_type in (ExternalObjectStorageUncompressed,
  1398 + ExternalObjectStorageCompressed):
  1399 + # re-position stream at start
  1400 + stream.seek(0, os.SEEK_SET)
  1401 +
  1402 + # look for candidate positions
  1403 + pattern = obj_type.generate_pattern()
  1404 + candidates = self.search_pattern(pattern, stream)
  1405 +
  1406 + # try parse
  1407 + for idx in candidates:
  1408 + # assume an ExternalObjectStorage in stream at idx
  1409 + stream.seek(idx)
  1410 + log.info('extracting at idx {}'.format(idx))
  1411 + try:
  1412 + storage = obj_type.extract_from(stream)
  1413 + except Exception:
  1414 + self._log_exception()
  1415 + continue
  1416 +
  1417 + errs = storage.check_validity()
  1418 + if errs:
  1419 + log.warning('check_validity found {} issues'
  1420 + .format(len(errs)))
  1421 + else:
  1422 + log.info('storage is ok; compressed={}, size={}, '
  1423 + 'size_decomp={}'
  1424 + .format(storage.compressed,
  1425 + storage.rec_head.rec_len,
  1426 + storage.uncompressed_size))
  1427 + storages.append(storage)
  1428 + for err in errs:
  1429 + log.warning('check_validity({}): {}'
  1430 + .format(obj_type.__name__, err))
  1431 + if errs and self.fast_fail:
  1432 + raise errs[0]
  1433 +
  1434 + return storages
  1435 +
  1436 + finally:
  1437 + if stream is not None:
  1438 + log.debug('closing stream')
  1439 + stream.close()
  1440 +
  1441 +
  1442 + def decompress_vba_storage(self, storage):
  1443 + """ return decompressed data from search_vba_storage """
  1444 +
  1445 + log.debug('decompressing storage for VBA OLE data stream ')
  1446 + stream = None
  1447 + try:
  1448 + log.debug('opening stream')
  1449 + stream = self.ole.openstream(MAIN_STREAM_NAME)
  1450 +
  1451 + # decompress iteratively; a zlib.decompress of all data
  1452 + # failed with Error -5 (incomplete or truncated stream)
  1453 + stream.seek(storage.data_offset, os.SEEK_SET)
  1454 + decomp, n_read, err = \
  1455 + iterative_decompress(stream, storage.data_size)
  1456 + log.info('decompressed {} to {} bytes, err is {}'
  1457 + .format(n_read, len(decomp), err))
  1458 + if err and self.fast_fail:
  1459 + raise err
  1460 + # otherwise try to continue with partial data
  1461 +
  1462 + return decomp
  1463 +
  1464 + ## create OleFileIO from decompressed data
  1465 + #ole = olefile.OleFileIO(decomp)
  1466 + #root_streams = [entry[0].lower() for entry in ole.listdir()]
  1467 + #for required in 'project', 'projectwm', 'vba':
  1468 + # if required not in root_streams:
  1469 + # raise ValueError('storage seems to not be a VBA storage '
  1470 + # '({} not found in root streams)'
  1471 + # .format(required))
  1472 + #log.debug('tests succeeded')
  1473 + #return ole
  1474 +
  1475 + finally:
  1476 + if stream is not None:
  1477 + log.debug('closing stream')
  1478 + stream.close()
  1479 +
  1480 +
  1481 +def iterative_decompress(stream, size, chunk_size=4096):
  1482 + """ decompress data from stream chunk-wise """
  1483 +
  1484 + decompressor = zlib.decompressobj()
  1485 + n_read = 0
  1486 + decomp = ''
  1487 + return_err = None
  1488 +
  1489 + try:
  1490 + while n_read < size:
  1491 + n_new = min(size-n_read, chunk_size)
  1492 + decomp += decompressor.decompress(stream.read(n_new))
  1493 + n_read += n_new
  1494 + except zlib.error as err:
  1495 + return_err = err
  1496 +
  1497 + return decomp, n_read, return_err
  1498 +
1266 # === TESTING ================================================================= 1499 # === TESTING =================================================================
1267 1500
1268 def test(): 1501 def test():
1269 """ for testing and debugging """ 1502 """ for testing and debugging """
1270 1503
1271 from glob import glob 1504 from glob import glob
  1505 + from olevba import VBA_Parser
1272 1506
1273 # setup logging 1507 # setup logging
1274 logging.basicConfig(level=logging.DEBUG, 1508 logging.basicConfig(level=logging.DEBUG,
@@ -1280,9 +1514,32 @@ def test(): @@ -1280,9 +1514,32 @@ def test():
1280 # parse 1514 # parse
1281 log.info('-' * 72) 1515 log.info('-' * 72)
1282 log.info('test file: {}'.format(file_name)) 1516 log.info('test file: {}'.format(file_name))
1283 - ppt = PptParser(file_name, fast_fail=False)  
1284 - #ppt.parse_document_persist_object()  
1285 - ppt.search_vba() 1517 + try:
  1518 + ppt = PptParser(file_name, fast_fail=False)
  1519 + #ppt.parse_document_persist_object()
  1520 + n_infos = len(ppt.search_vba_info())
  1521 + storages = ppt.search_vba_storage()
  1522 + n_storages = len(storages)
  1523 + log.debug('found {} infos and {} storages'.format(n_infos,
  1524 + n_storages))
  1525 + if n_infos != n_storages:
  1526 + log.warning('found different number of vba infos and storages')
  1527 + for storage in storages:
  1528 + parser = VBA_Parser(None, ppt.decompress_vba_storage(storage),
  1529 + container=file_name)
  1530 + for vba_root, project_path, dir_path in parser.find_vba_projects():
  1531 + log.info('found vba project: root={}, proj={}, dir={}'
  1532 + .format(vba_root, project_path, dir_path))
  1533 + for subfilename, stream_path, vba_filename, vba_code in \
  1534 + parser.extract_all_macros():
  1535 + log.info('found macro: subfile={}, stream={}, vbafile={}'
  1536 + .format(subfilename, stream_path, vba_filename))
  1537 + for line in vba_code.splitlines():
  1538 + log.info('code: {}'.format(line.rstrip()))
  1539 +
  1540 +
  1541 + except Exception:
  1542 + log.exception('exception')
1286 1543
1287 1544
1288 if __name__ == '__main__': 1545 if __name__ == '__main__':