Commit 05a27a43643562e52dbc3710e7e3dd044adf872f

Authored by Christian Herdtweck
1 parent 8ee20161

managed to extract vba stream from ppt by byte-search for ExternalObjectStorage

Qapla'
Showing 1 changed file with 300 additions and 43 deletions
oletools/ppt_parser.py
... ... @@ -19,7 +19,8 @@ References:
19 19 # - make stream optional in PptUnexpectedData
20 20 # - can speed-up by using less bigger struct.parse calls?
21 21 # - license
22   -# - create a AtomBase class that defines check_value and parses RecordHead?
  22 +# - make buffered stream from output of iterative_decompress
  23 +# - less stream open/close, possibly through decorator for open+closing?
23 24 #
24 25 # CHANGELOG:
25 26 # 2016-05-04 v0.01 CH: - start parsing "Current User" stream
... ... @@ -33,9 +34,11 @@ import logging
33 34 import struct
34 35 import traceback
35 36 import os
  37 +import cStringIO
36 38  
37 39 import thirdparty.olefile as olefile
38 40 from olevba import get_logger
  41 +import zlib
39 42  
40 43  
41 44 # a global logger object used for debugging:
... ... @@ -132,6 +135,8 @@ class RecordHeader(object):
132 135  
133 136 length of result depends on rec_len being given or not
134 137 """
  138 + if rec_type is None:
  139 + raise ValueError('RECORD_TYPE not set!')
135 140 version_instance = rec_ver + 2**4 * rec_instance
136 141 if rec_len is None:
137 142 return struct.pack('<HH', version_instance, rec_type)
... ... @@ -161,7 +166,12 @@ class PptType(object):
161 166 self.rec_head = RecordHeader.extract_from(stream)
162 167  
163 168 def check_validity(self):
164   - """ to be overwritten in subclasses
  169 + """ check validity of data
  170 +
  171 + replaces 'raise PptUnexpectedData' so caller can get all the errors
  172 + (not just the first) whenever she wishes.
  173 +
  174 + to be overwritten in subclasses
165 175  
166 176 :returns: list of PptUnexpectedData
167 177 """
... ... @@ -243,6 +253,12 @@ class PptType(object):
243 253 self.rec_head.rec_len, length))
244 254 return errs
245 255  
  256 + @classmethod
  257 + def generate_pattern(clz, rec_len=None):
  258 + """ call RecordHeader.generate with values for this type """
  259 + return RecordHeader.generate(clz.RECORD_TYPE, rec_len,
  260 + clz.RECORD_INSTANCE, clz.RECORD_VERSION)
  261 +
246 262  
247 263 class CurrentUserAtom(PptType):
248 264 """ An atom record that specifies information about the last user to modify
... ... @@ -853,7 +869,7 @@ class VBAInfoContainer(PptType):
853 869 if rec_head is None:
854 870 obj.read_rec_head(stream)
855 871 else:
856   - log.debug('skip parsing of RecordHead')
  872 + log.debug('skip parsing of RecordHeader')
857 873 obj.rec_head = rec_head
858 874 obj.vba_info_atom = VBAInfoAtom.extract_from(stream)
859 875 return obj
... ... @@ -912,6 +928,92 @@ class VBAInfoAtom(PptType):
912 928 errs.extend(self.check_value('version', self.version, 2))
913 929 return errs
914 930  
  931 +
class ExternalObjectStorage(PptType):
    """ storage for compressed/uncompressed OLE/VBA/ActiveX control data

    Matches types ExOleObjStgCompressedAtom, ExOleObjStgUncompressedAtom,
    VbaProjectStgCompressedAtom, VbaProjectStgUncompressedAtom,
    ExControlStgUncompressedAtom, ExControlStgCompressedAtom

    Difference between compressed and uncompressed: RecordHeader.rec_instance
    is 0 or 1, first variable after RecordHeader is decompressed_size

    Data is not read at first, only its offset in the stream and size is saved

    e.g.
    https://msdn.microsoft.com/en-us/library/dd952169%28v=office.12%29.aspx
    """

    RECORD_TYPE = 0x1011
    RECORD_INSTANCE_COMPRESSED = 1
    RECORD_INSTANCE_UNCOMPRESSED = 0

    def __init__(self, compressed=None):
        """ create an ExternalObjectStorage

        :param compressed: True/False if known, None if not yet known
        """
        super(ExternalObjectStorage, self).__init__()
        # BUGFIX: self.compressed used to stay unset when compressed was
        # None, so extract_from crashed with AttributeError on its first
        # log.debug line; now the attribute always exists
        if compressed is None:
            self.compressed = None
            self.RECORD_INSTANCE = None  # otherwise defaults to 0
        elif compressed:
            self.compressed = True
            self.RECORD_INSTANCE = self.RECORD_INSTANCE_COMPRESSED
        else:
            self.compressed = False
            self.RECORD_INSTANCE = self.RECORD_INSTANCE_UNCOMPRESSED
        self.uncompressed_size = None   # only set for compressed storages
        self.data_offset = None         # start of payload in stream
        self.data_size = None           # payload size in bytes

    def extract_from(self, stream):
        """ read record header plus data size/offset from stream

        not a classmethod because behaviour depends on the compressed
        attribute set in the constructor

        see also: DummyType

        :param stream: file-like object positioned at start of the record
        """
        log.debug('Parsing ExternalObjectStorage (compressed={}) from stream'
                  .format(self.compressed))
        self.read_rec_head(stream)
        self.data_size = self.rec_head.rec_len
        if self.compressed:
            # for compressed storages the first 4 payload bytes hold the
            # decompressed size, so the actual data is 4 bytes shorter
            log.debug('is compressed --> reduce size')
            self.uncompressed_size = read_4(stream)
            self.data_size -= 4
        self.data_offset = stream.tell()

    def check_validity(self):
        """ only the record header can be checked; data is not parsed here

        :returns: list of PptUnexpectedData
        """
        return self.check_rec_head()
  984 +
class ExternalObjectStorageUncompressed(ExternalObjectStorage):
    """ subclass of ExternalObjectStorage for uncompressed objects """

    RECORD_INSTANCE = ExternalObjectStorage.RECORD_INSTANCE_UNCOMPRESSED

    def __init__(self):
        # fix compressed=False for this subclass
        super(ExternalObjectStorageUncompressed, self).__init__(False)

    @classmethod
    def extract_from(clz, stream):
        """ create an instance and fill it from stream

        delegates to the *instance* method extract_from of the base class
        on a freshly constructed object
        """
        instance = clz()
        ExternalObjectStorage.extract_from(instance, stream)
        return instance
  999 +
  1000 +
class ExternalObjectStorageCompressed(ExternalObjectStorage):
    """ subclass of ExternalObjectStorage for compressed objects """

    RECORD_INSTANCE = ExternalObjectStorage.RECORD_INSTANCE_COMPRESSED

    def __init__(self):
        # fix compressed=True for this subclass
        super(ExternalObjectStorageCompressed, self).__init__(True)

    @classmethod
    def extract_from(clz, stream):
        """ create an instance and fill it from stream

        delegates to the *instance* method extract_from of the base class
        on a freshly constructed object
        """
        instance = clz()
        ExternalObjectStorage.extract_from(instance, stream)
        return instance
  1015 +
  1016 +
915 1017 # === PptParser ===============================================================
916 1018  
917 1019  
... ... @@ -1180,59 +1282,65 @@ class PptParser(object):
1180 1282 if errs and self.fast_fail:
1181 1283 raise errs[0]
1182 1284  
1183   - def search_vba(self):
1184   - """ quick-and-dirty: do not parse everything, just look for right bytes
1185   -
1186   - "quick" here means quick to program. Runtime now is linear is document
1187   - size (--> for big documents the other method might be faster)
1188   - """
  1285 + def search_pattern(self, pattern, stream):
  1286 + """ search for pattern in stream, return indices """
1189 1287  
1190 1288 BUF_SIZE = 1024
1191 1289  
1192   - pattern = RecordHeader.generate(
1193   - VBAInfoContainer.RECORD_TYPE,
1194   - rec_len=VBAInfoContainer.RECORD_LENGTH,
1195   - rec_instance=VBAInfoContainer.RECORD_INSTANCE,
1196   - rec_ver=VBAInfoContainer.RECORD_VERSION) \
1197   - + RecordHeader.generate(
1198   - VBAInfoAtom.RECORD_TYPE,
1199   - rec_len=VBAInfoAtom.RECORD_LENGTH,
1200   - rec_instance=VBAInfoAtom.RECORD_INSTANCE,
1201   - rec_ver=VBAInfoAtom.RECORD_VERSION)
1202 1290 pattern_len = len(pattern)
1203 1291 log.debug('pattern length is {}'.format(pattern_len))
1204 1292 if pattern_len > BUF_SIZE:
1205 1293 raise ValueError('need buf > pattern to search!')
1206 1294  
  1295 + n_reads = 0
  1296 + candidates = []
  1297 + while True:
  1298 + start_pos = stream.tell()
  1299 + n_reads += 1
  1300 + #log.debug('read {} starting from {}'
  1301 + # .format(BUF_SIZE, start_pos))
  1302 + buf = stream.read(BUF_SIZE)
  1303 + idx = buf.find(pattern)
  1304 + while idx != -1:
  1305 + log.info('found pattern at index {}'.format(start_pos+idx))
  1306 + candidates.append(start_pos+idx)
  1307 + idx = buf.find(pattern, idx+1)
  1308 +
  1309 + if len(buf) == BUF_SIZE:
  1310 + # move back a bit to avoid splitting of pattern through buf
  1311 + stream.seek(-1 * pattern_len, os.SEEK_CUR)
  1312 + else:
  1313 + log.debug('reached end of buf (read {}<{}) after {} reads'
  1314 + .format(len(buf), BUF_SIZE, n_reads))
  1315 + break
  1316 + return candidates
  1317 +
  1318 +
  1319 + def search_vba_info(self):
  1320 + """ search through stream for VBAInfoContainer, alternative to parse...
  1321 +
  1322 + quick-and-dirty: do not parse everything, just look for right bytes
  1323 +
  1324 + "quick" here means quick to program. Runtime now is linear is document
  1325 + size (--> for big documents the other method might be faster)
  1326 +
  1327 + .. seealso:: search_vba_storage
  1328 + """
  1329 +
  1330 + pattern = VBAInfoContainer.generate_pattern(
  1331 + rec_len=VBAInfoContainer.RECORD_LENGTH) \
  1332 + + VBAInfoAtom.generate_pattern(
  1333 + rec_len=VBAInfoAtom.RECORD_LENGTH)
1207 1334 stream = None
1208 1335 try:
1209 1336 log.debug('opening stream')
1210 1337 stream = self.ole.openstream(MAIN_STREAM_NAME)
1211 1338  
1212 1339 # look for candidate positions
1213   - n_reads = 0
1214   - candidates = []
1215   - while True:
1216   - start_pos = stream.tell()
1217   - n_reads += 1
1218   - #log.debug('read {} starting from {}'
1219   - # .format(BUF_SIZE, start_pos))
1220   - buf = stream.read(BUF_SIZE)
1221   - idx = buf.find(pattern)
1222   - while idx != -1:
1223   - log.info('found pattern at index {}'.format(start_pos+idx))
1224   - candidates.append(start_pos+idx)
1225   - idx = buf.find(pattern, idx+1)
1226   -
1227   - if len(buf) == BUF_SIZE:
1228   - # move back a bit to avoid splitting of pattern through buf
1229   - stream.seek(-1 * pattern_len, os.SEEK_CUR)
1230   - else:
1231   - log.debug('reached end of buf (read {}<{}) after {} reads'
1232   - .format(len(buf), BUF_SIZE, n_reads))
1233   - break
  1340 + candidates = self.search_pattern(pattern, stream)
1234 1341  
1235 1342 # try parse
  1343 + containers = []
1236 1344 for idx in candidates:
1237 1345 # assume that in stream at idx there is a VBAInfoContainer
1238 1346 stream.seek(idx)
... ... @@ -1252,23 +1360,149 @@ class PptParser(object):
1252 1360 log.info('persist id ref is {}, has_macros {}, version {}'
1253 1361 .format(atom.persist_id_ref, atom.f_has_macros,
1254 1362 atom.version))
  1363 + containers.append(container)
1255 1364 for err in errs:
1256 1365 log.warning('check_validity(VBAInfoContainer): {}'
1257 1366 .format(err))
1258 1367 if errs and self.fast_fail:
1259 1368 raise errs[0]
1260 1369  
  1370 + return containers
  1371 +
1261 1372 finally:
1262 1373 if stream is not None:
1263 1374 log.debug('closing stream')
1264 1375 stream.close()
1265 1376  
  1377 + def search_vba_storage(self):
  1378 + """ search through stream for VBAProjectStg, alternative to parse...
  1379 +
  1380 + quick-and-dirty: do not parse everything, just look for right bytes
  1381 +
  1382 + "quick" here means quick to program. Runtime now is linear is document
  1383 + size (--> for big documents the other method might be faster)
  1384 +
  1385 + The storages found could also contain (instead of VBA data): ActiveX
  1386 + data or general OLE data
  1387 +
  1388 + .. seealso:: :py:meth:`search_vba_info`
  1389 + """
  1390 +
  1391 + stream = None
  1392 + try:
  1393 + log.debug('opening stream')
  1394 + stream = self.ole.openstream(MAIN_STREAM_NAME)
  1395 +
  1396 + storages = []
  1397 + for obj_type in (ExternalObjectStorageUncompressed,
  1398 + ExternalObjectStorageCompressed):
  1399 + # re-position stream at start
  1400 + stream.seek(0, os.SEEK_SET)
  1401 +
  1402 + # look for candidate positions
  1403 + pattern = obj_type.generate_pattern()
  1404 + candidates = self.search_pattern(pattern, stream)
  1405 +
  1406 + # try parse
  1407 + for idx in candidates:
  1408 + # assume a ExternalObjectStorage in stream at idx
  1409 + stream.seek(idx)
  1410 + log.info('extracting at idx {}'.format(idx))
  1411 + try:
  1412 + storage = obj_type.extract_from(stream)
  1413 + except Exception:
  1414 + self._log_exception()
  1415 + continue
  1416 +
  1417 + errs = storage.check_validity()
  1418 + if errs:
  1419 + log.warning('check_validity found {} issues'
  1420 + .format(len(errs)))
  1421 + else:
  1422 + log.info('storage is ok; compressed={}, size={}, '
  1423 + 'size_decomp={}'
  1424 + .format(storage.compressed,
  1425 + storage.rec_head.rec_len,
  1426 + storage.uncompressed_size))
  1427 + storages.append(storage)
  1428 + for err in errs:
  1429 + log.warning('check_validity({}): {}'
  1430 + .format(obj_type.__name__, err))
  1431 + if errs and self.fast_fail:
  1432 + raise errs[0]
  1433 +
  1434 + return storages
  1435 +
  1436 + finally:
  1437 + if stream is not None:
  1438 + log.debug('closing stream')
  1439 + stream.close()
  1440 +
  1441 +
  1442 + def decompress_vba_storage(self, storage):
  1443 + """ return decompressed data from search_vba_storage """
  1444 +
  1445 + log.debug('decompressing storage for VBA OLE data stream ')
  1446 + stream = None
  1447 + try:
  1448 + log.debug('opening stream')
  1449 + stream = self.ole.openstream(MAIN_STREAM_NAME)
  1450 +
  1451 + # decompress iteratively; a zlib.decompress of all data
  1452 + # failed with Error -5 (incomplete or truncated stream)
  1453 + stream.seek(storage.data_offset, os.SEEK_SET)
  1454 + decomp, n_read, err = \
  1455 + iterative_decompress(stream, storage.data_size)
  1456 + log.info('decompressed {} to {} bytes, err is {}'
  1457 + .format(n_read, len(decomp), err))
  1458 + if err and self.fast_fail:
  1459 + raise err
  1460 + # otherwise try to continue with partial data
  1461 +
  1462 + return decomp
  1463 +
  1464 + ## create OleFileIO from decompressed data
  1465 + #ole = olefile.OleFileIO(decomp)
  1466 + #root_streams = [entry[0].lower() for entry in ole.listdir()]
  1467 + #for required in 'project', 'projectwm', 'vba':
  1468 + # if required not in root_streams:
  1469 + # raise ValueError('storage seems to not be a VBA storage '
  1470 + # '({} not found in root streams)'
  1471 + # .format(required))
  1472 + #log.debug('tests succeeded')
  1473 + #return ole
  1474 +
  1475 + finally:
  1476 + if stream is not None:
  1477 + log.debug('closing stream')
  1478 + stream.close()
  1479 +
  1480 +
def iterative_decompress(stream, size, chunk_size=4096):
    """ decompress zlib data from stream chunk-wise

    Reads at most `size` bytes from `stream` in chunks of `chunk_size` and
    feeds them to a zlib decompressor.  Never raises: a zlib error stops
    decompression and is returned so the caller can decide whether the
    partial data is still useful.

    BUGFIX: count bytes actually read, not bytes requested, so a short or
    EOF read no longer mis-reports progress; stop on EOF instead of
    issuing further empty reads; flush the decompressor so buffered tail
    data is not lost.

    :param stream: file-like object positioned at start of compressed data
    :param size: number of compressed bytes to read from stream
    :param chunk_size: read granularity in bytes (default 4096)
    :returns: 3-tuple (decompressed data, n compressed bytes actually read,
              zlib.error instance or None)
    """

    decompressor = zlib.decompressobj()
    n_read = 0
    decomp = b''        # b'' == '' on python2, keeps python3 working too
    return_err = None

    try:
        while n_read < size:
            chunk = stream.read(min(size - n_read, chunk_size))
            if not chunk:
                # stream ended before `size` bytes were available
                break
            decomp += decompressor.decompress(chunk)
            n_read += len(chunk)    # bytes actually read, not requested
        decomp += decompressor.flush()   # emit any buffered tail data
    except zlib.error as err:
        return_err = err

    return decomp, n_read, return_err
  1498 +
1266 1499 # === TESTING =================================================================
1267 1500  
1268 1501 def test():
1269 1502 """ for testing and debugging """
1270 1503  
1271 1504 from glob import glob
  1505 + from olevba import VBA_Parser
1272 1506  
1273 1507 # setup logging
1274 1508 logging.basicConfig(level=logging.DEBUG,
... ... @@ -1280,9 +1514,32 @@ def test():
1280 1514 # parse
1281 1515 log.info('-' * 72)
1282 1516 log.info('test file: {}'.format(file_name))
1283   - ppt = PptParser(file_name, fast_fail=False)
1284   - #ppt.parse_document_persist_object()
1285   - ppt.search_vba()
  1517 + try:
  1518 + ppt = PptParser(file_name, fast_fail=False)
  1519 + #ppt.parse_document_persist_object()
  1520 + n_infos = len(ppt.search_vba_info())
  1521 + storages = ppt.search_vba_storage()
  1522 + n_storages = len(storages)
  1523 + log.debug('found {} infos and {} storages'.format(n_infos,
  1524 + n_storages))
  1525 + if n_infos != n_storages:
  1526 + log.warning('found different number of vba infos and storages')
  1527 + for storage in storages:
  1528 + parser = VBA_Parser(None, ppt.decompress_vba_storage(storage),
  1529 + container=file_name)
  1530 + for vba_root, project_path, dir_path in parser.find_vba_projects():
  1531 + log.info('found vba project: root={}, proj={}, dir={}'
  1532 + .format(vba_root, project_path, dir_path))
  1533 + for subfilename, stream_path, vba_filename, vba_code in \
  1534 + parser.extract_all_macros():
  1535 + log.info('found macro: subfile={}, stream={}, vbafile={}'
  1536 + .format(subfilename, stream_path, vba_filename))
  1537 + for line in vba_code.splitlines():
  1538 + log.info('code: {}'.format(line.rstrip()))
  1539 +
  1540 +
  1541 + except Exception:
  1542 + log.exception('exception')
1286 1543  
1287 1544  
1288 1545 if __name__ == '__main__':
... ...