Commit 1c52c0d5430856fcdef9f2d8f4720b0180a8ffa9

Authored by decalage2
1 parent b8c80db7

olevba: synchronized some changes with olevba3 (issue #106)

oletools/olevba.py
@@ -269,6 +269,8 @@ import ppt_parser @@ -269,6 +269,8 @@ import ppt_parser
269 import email.feedparser 269 import email.feedparser
270 email.feedparser.headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:?|[\t ])') 270 email.feedparser.headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:?|[\t ])')
271 271
  272 +# === PYTHON 2+3 SUPPORT ======================================================
  273 +
272 if sys.version_info[0] <= 2: 274 if sys.version_info[0] <= 2:
273 # Python 2.x 275 # Python 2.x
274 if sys.version_info[1] <= 6: 276 if sys.version_info[1] <= 6:
@@ -281,6 +283,8 @@ if sys.version_info[0] <= 2: @@ -281,6 +283,8 @@ if sys.version_info[0] <= 2:
281 else: 283 else:
282 # Python 3.x+ 284 # Python 3.x+
283 from zipfile import is_zipfile 285 from zipfile import is_zipfile
  286 + # xrange is now called range:
  287 + xrange = range
284 288
285 # === LOGGING ================================================================= 289 # === LOGGING =================================================================
286 290
@@ -443,7 +447,7 @@ TYPE2TAG = { @@ -443,7 +447,7 @@ TYPE2TAG = {
443 447
444 448
445 # MSO files ActiveMime header magic 449 # MSO files ActiveMime header magic
446 -MSO_ACTIVEMIME_HEADER = 'ActiveMime' 450 +MSO_ACTIVEMIME_HEADER = b'ActiveMime'
447 451
448 MODULE_EXTENSION = "bas" 452 MODULE_EXTENSION = "bas"
449 CLASS_EXTENSION = "cls" 453 CLASS_EXTENSION = "cls"
@@ -2252,7 +2256,7 @@ class VBA_Parser(object): @@ -2252,7 +2256,7 @@ class VBA_Parser(object):
2252 if data is None: 2256 if data is None:
2253 data = open(filename, 'rb').read() 2257 data = open(filename, 'rb').read()
2254 # check if it is a Word 2003 XML file (WordProcessingML): must contain the namespace 2258 # check if it is a Word 2003 XML file (WordProcessingML): must contain the namespace
2255 - if 'http://schemas.microsoft.com/office/word/2003/wordml' in data: 2259 + if b'http://schemas.microsoft.com/office/word/2003/wordml' in data:
2256 self.open_word2003xml(data) 2260 self.open_word2003xml(data)
2257 # store a lowercase version for the next tests: 2261 # store a lowercase version for the next tests:
2258 data_lowercase = data.lower() 2262 data_lowercase = data.lower()
@@ -2262,14 +2266,14 @@ class VBA_Parser(object): @@ -2262,14 +2266,14 @@ class VBA_Parser(object):
2262 # and even whitespaces in between "MIME", "-", "Version" and ":". The version number is ignored. 2266 # and even whitespaces in between "MIME", "-", "Version" and ":". The version number is ignored.
2263 # And the line is case insensitive. 2267 # And the line is case insensitive.
2264 # so we'll just check the presence of mime, version and multipart anywhere: 2268 # so we'll just check the presence of mime, version and multipart anywhere:
2265 - if self.type is None and 'mime' in data_lowercase and 'version' in data_lowercase \  
2266 - and 'multipart' in data_lowercase: 2269 + if self.type is None and b'mime' in data_lowercase and b'version' in data_lowercase \
  2270 + and b'multipart' in data_lowercase:
2267 self.open_mht(data) 2271 self.open_mht(data)
2268 #TODO: handle exceptions 2272 #TODO: handle exceptions
2269 #TODO: Excel 2003 XML 2273 #TODO: Excel 2003 XML
2270 # Check if this is a plain text VBA or VBScript file: 2274 # Check if this is a plain text VBA or VBScript file:
2271 # To avoid scanning binary files, we simply check for some control chars: 2275 # To avoid scanning binary files, we simply check for some control chars:
2272 - if self.type is None and '\x00' not in data: 2276 + if self.type is None and b'\x00' not in data:
2273 self.open_text(data) 2277 self.open_text(data)
2274 if self.type is None: 2278 if self.type is None:
2275 # At this stage, could not match a known format: 2279 # At this stage, could not match a known format:
oletools/olevba3.py
@@ -12,6 +12,7 @@ Supported formats: @@ -12,6 +12,7 @@ Supported formats:
12 - PowerPoint 97-2003 (.ppt), PowerPoint 2007+ (.pptm, .ppsm) 12 - PowerPoint 97-2003 (.ppt), PowerPoint 2007+ (.pptm, .ppsm)
13 - Word 2003 XML (.xml) 13 - Word 2003 XML (.xml)
14 - Word/Excel Single File Web Page / MHTML (.mht) 14 - Word/Excel Single File Web Page / MHTML (.mht)
  15 +- Publisher (.pub)
15 16
16 Author: Philippe Lagadec - http://www.decalage.info 17 Author: Philippe Lagadec - http://www.decalage.info
17 License: BSD, see source code or documentation 18 License: BSD, see source code or documentation
@@ -72,6 +73,8 @@ https://github.com/unixfreak0037/officeparser @@ -72,6 +73,8 @@ https://github.com/unixfreak0037/officeparser
72 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 73 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
73 # SOFTWARE. 74 # SOFTWARE.
74 75
  76 +from __future__ import print_function
  77 +
75 #------------------------------------------------------------------------------ 78 #------------------------------------------------------------------------------
76 # CHANGELOG: 79 # CHANGELOG:
77 # 2014-08-05 v0.01 PL: - first version based on officeparser code 80 # 2014-08-05 v0.01 PL: - first version based on officeparser code
@@ -178,9 +181,16 @@ https://github.com/unixfreak0037/officeparser @@ -178,9 +181,16 @@ https://github.com/unixfreak0037/officeparser
178 # 2016-06-12 v0.50 PL: - fixed small bugs in VBA parsing code 181 # 2016-06-12 v0.50 PL: - fixed small bugs in VBA parsing code
179 # 2016-07-01 PL: - fixed issue #58 with format() to support Python 2.6 182 # 2016-07-01 PL: - fixed issue #58 with format() to support Python 2.6
180 # 2016-07-29 CH: - fixed several bugs including #73 (Mac Roman encoding) 183 # 2016-07-29 CH: - fixed several bugs including #73 (Mac Roman encoding)
  184 +# 2016-08-31 PL: - added autoexec keyword InkPicture_Painted
  185 +# - detect_autoexec now returns the exact keyword found
  186 +# 2016-09-05 PL: - added autoexec keywords for MS Publisher (.pub)
  187 +# 2016-09-06 PL: - fixed issue #20, is_zipfile on Python 2.6
  188 +# 2016-09-12 PL: - enabled packrat to improve pyparsing performance
  189 +# 2016-10-25 PL: - fixed raise and print statements for Python 3
181 # 2016-10-25 PL: - fixed regex bytes strings (PR/issue #100) 190 # 2016-10-25 PL: - fixed regex bytes strings (PR/issue #100)
  191 +# 2016-11-03 v0.51 PL: - added EnumDateFormats and EnumSystemLanguageGroupsW
182 192
183 -__version__ = '0.50' 193 +__version__ = '0.51a'
184 194
185 #------------------------------------------------------------------------------ 195 #------------------------------------------------------------------------------
186 # TODO: 196 # TODO:
@@ -260,6 +270,22 @@ import oletools.ppt_parser as ppt_parser @@ -260,6 +270,22 @@ import oletools.ppt_parser as ppt_parser
260 import email.feedparser 270 import email.feedparser
261 email.feedparser.headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:?|[\t ])') 271 email.feedparser.headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:?|[\t ])')
262 272
  273 +# === PYTHON 2+3 SUPPORT ======================================================
  274 +
  275 +if sys.version_info[0] <= 2:
  276 + # Python 2.x
  277 + if sys.version_info[1] <= 6:
  278 + # Python 2.6
  279 + # use is_zipfile backported from Python 2.7:
  280 + from thirdparty.zipfile27 import is_zipfile
  281 + else:
  282 + # Python 2.7
  283 + from zipfile import is_zipfile
  284 +else:
  285 + # Python 3.x+
  286 + from zipfile import is_zipfile
  287 + # xrange is now called range:
  288 + xrange = range
263 289
264 # === LOGGING ================================================================= 290 # === LOGGING =================================================================
265 291
@@ -438,7 +464,7 @@ ATTR_NAME = NS_W + 'name' @@ -438,7 +464,7 @@ ATTR_NAME = NS_W + 'name'
438 AUTOEXEC_KEYWORDS = { 464 AUTOEXEC_KEYWORDS = {
439 # MS Word: 465 # MS Word:
440 'Runs when the Word document is opened': 466 'Runs when the Word document is opened':
441 - ('AutoExec', 'AutoOpen', 'Document_Open', 'DocumentOpen'), 467 + ('AutoExec', 'AutoOpen', 'DocumentOpen'),
442 'Runs when the Word document is closed': 468 'Runs when the Word document is closed':
443 ('AutoExit', 'AutoClose', 'Document_Close', 'DocumentBeforeClose'), 469 ('AutoExit', 'AutoClose', 'Document_Close', 'DocumentBeforeClose'),
444 'Runs when the Word document is modified': 470 'Runs when the Word document is modified':
@@ -446,13 +472,24 @@ AUTOEXEC_KEYWORDS = { @@ -446,13 +472,24 @@ AUTOEXEC_KEYWORDS = {
446 'Runs when a new Word document is created': 472 'Runs when a new Word document is created':
447 ('AutoNew', 'Document_New', 'NewDocument'), 473 ('AutoNew', 'Document_New', 'NewDocument'),
448 474
  475 + # MS Word and Publisher:
  476 + 'Runs when the Word or Publisher document is opened':
  477 + ('Document_Open',),
  478 + 'Runs when the Publisher document is closed':
  479 + ('Document_BeforeClose',),
  480 +
449 # MS Excel: 481 # MS Excel:
450 'Runs when the Excel Workbook is opened': 482 'Runs when the Excel Workbook is opened':
451 ('Auto_Open', 'Workbook_Open', 'Workbook_Activate'), 483 ('Auto_Open', 'Workbook_Open', 'Workbook_Activate'),
452 'Runs when the Excel Workbook is closed': 484 'Runs when the Excel Workbook is closed':
453 ('Auto_Close', 'Workbook_Close'), 485 ('Auto_Close', 'Workbook_Close'),
454 486
455 - #TODO: full list in MS specs?? 487 + # any MS Office application:
  488 + 'Runs when the file is opened (using InkPicture ActiveX object)':
  489 + # ref:https://twitter.com/joe4security/status/770691099988025345
  490 + (r'\w+_Painted',),
  491 + 'Runs when the file is opened and ActiveX objects trigger events':
  492 + (r'\w+_(?:GotFocus|LostFocus|MouseHover)',),
456 } 493 }
457 494
458 # Suspicious Keywords that may be used by malware 495 # Suspicious Keywords that may be used by malware
@@ -516,7 +553,11 @@ SUSPICIOUS_KEYWORDS = { @@ -516,7 +553,11 @@ SUSPICIOUS_KEYWORDS = {
516 ('Lib',), 553 ('Lib',),
517 'May inject code into another process': 554 'May inject code into another process':
518 ('CreateThread', 'VirtualAlloc', # (issue #9) suggested by Davy Douhine - used by MSF payload 555 ('CreateThread', 'VirtualAlloc', # (issue #9) suggested by Davy Douhine - used by MSF payload
  556 + 'VirtualAllocEx', 'RtlMoveMemory',
519 ), 557 ),
  558 + 'May run a shellcode in memory':
  559 + ('EnumSystemLanguageGroupsW?', # Used by Hancitor in Oct 2016
  560 + 'EnumDateFormats(?:W|(?:Ex){1,2})?'), # see https://msdn.microsoft.com/en-us/library/windows/desktop/dd317810(v=vs.85).aspx
520 'May download files from the Internet': 561 'May download files from the Internet':
521 #TODO: regex to find urlmon+URLDownloadToFileA on same line 562 #TODO: regex to find urlmon+URLDownloadToFileA on same line
522 ('URLDownloadToFileA', 'Msxml2.XMLHTTP', 'Microsoft.XMLHTTP', 563 ('URLDownloadToFileA', 'Msxml2.XMLHTTP', 'Microsoft.XMLHTTP',
@@ -532,7 +573,7 @@ SUSPICIOUS_KEYWORDS = { @@ -532,7 +573,7 @@ SUSPICIOUS_KEYWORDS = {
532 'May attempt to obfuscate malicious function calls': 573 'May attempt to obfuscate malicious function calls':
533 ('CallByName',), 574 ('CallByName',),
534 #CallByName: http://msdn.microsoft.com/en-us/library/office/gg278760%28v=office.15%29.aspx 575 #CallByName: http://msdn.microsoft.com/en-us/library/office/gg278760%28v=office.15%29.aspx
535 - 'May attempt to obfuscate specific strings': 576 + 'May attempt to obfuscate specific strings (use option --deobf to deobfuscate)':
536 #TODO: regex to find several Chr*, not just one 577 #TODO: regex to find several Chr*, not just one
537 ('Chr', 'ChrB', 'ChrW', 'StrReverse', 'Xor'), 578 ('Chr', 'ChrB', 'ChrW', 'StrReverse', 'Xor'),
538 #Chr: http://msdn.microsoft.com/en-us/library/office/gg264465%28v=office.15%29.aspx 579 #Chr: http://msdn.microsoft.com/en-us/library/office/gg264465%28v=office.15%29.aspx
@@ -571,8 +612,6 @@ SUSPICIOUS_KEYWORDS = { @@ -571,8 +612,6 @@ SUSPICIOUS_KEYWORDS = {
571 'May detect WinJail Sandbox': 612 'May detect WinJail Sandbox':
572 # ref: http://www.cplusplus.com/forum/windows/96874/ 613 # ref: http://www.cplusplus.com/forum/windows/96874/
573 ('Afx:400000:0',), 614 ('Afx:400000:0',),
574 - 'Memory manipulation':  
575 - ('VirtualAllocEx', 'RtlMoveMemory'),  
576 } 615 }
577 616
578 # Regular Expression for a URL: 617 # Regular Expression for a URL:
@@ -646,6 +685,10 @@ re_printable_string = re.compile(b'[\\t\\r\\n\\x20-\\xFF]{5,}') @@ -646,6 +685,10 @@ re_printable_string = re.compile(b'[\\t\\r\\n\\x20-\\xFF]{5,}')
646 # TODO: set whitespaces according to VBA 685 # TODO: set whitespaces according to VBA
647 # TODO: merge extended lines before parsing 686 # TODO: merge extended lines before parsing
648 687
  688 +# Enable PackRat for better performance:
  689 +# (see https://pythonhosted.org/pyparsing/pyparsing.ParserElement-class.html#enablePackrat)
  690 +ParserElement.enablePackrat()
  691 +
649 # VBA identifier chars (from MS-VBAL 3.3.5) 692 # VBA identifier chars (from MS-VBAL 3.3.5)
650 vba_identifier_chars = alphanums + '_' 693 vba_identifier_chars = alphanums + '_'
651 694
@@ -1712,9 +1755,11 @@ def detect_autoexec(vba_code, obfuscation=None): @@ -1712,9 +1755,11 @@ def detect_autoexec(vba_code, obfuscation=None):
1712 for keyword in keywords: 1755 for keyword in keywords:
1713 #TODO: if keyword is already a compiled regex, use it as-is 1756 #TODO: if keyword is already a compiled regex, use it as-is
1714 # search using regex to detect word boundaries: 1757 # search using regex to detect word boundaries:
1715 - if re.search(r'(?i)\b' + keyword + r'\b', vba_code): 1758 + match = re.search(r'(?i)\b' + keyword + r'\b', vba_code)
  1759 + if match:
1716 #if keyword.lower() in vba_code: 1760 #if keyword.lower() in vba_code:
1717 - results.append((keyword, description + obf_text)) 1761 + found_keyword = match.group()
  1762 + results.append((found_keyword, description + obf_text))
1718 return results 1763 return results
1719 1764
1720 1765
@@ -1736,9 +1781,11 @@ def detect_suspicious(vba_code, obfuscation=None): @@ -1736,9 +1781,11 @@ def detect_suspicious(vba_code, obfuscation=None):
1736 for description, keywords in SUSPICIOUS_KEYWORDS.items(): 1781 for description, keywords in SUSPICIOUS_KEYWORDS.items():
1737 for keyword in keywords: 1782 for keyword in keywords:
1738 # search using regex to detect word boundaries: 1783 # search using regex to detect word boundaries:
1739 - if re.search(r'(?i)\b' + keyword + r'\b', vba_code): 1784 + match = re.search(r'(?i)\b' + keyword + r'\b', vba_code)
  1785 + if match:
1740 #if keyword.lower() in vba_code: 1786 #if keyword.lower() in vba_code:
1741 - results.append((keyword, description + obf_text)) 1787 + found_keyword = match.group()
  1788 + results.append((found_keyword, description + obf_text))
1742 return results 1789 return results
1743 1790
1744 1791
@@ -2203,7 +2250,7 @@ class VBA_Parser(object): @@ -2203,7 +2250,7 @@ class VBA_Parser(object):
2203 2250
2204 # if this worked, try whether it is a ppt file (special ole file) 2251 # if this worked, try whether it is a ppt file (special ole file)
2205 self.open_ppt() 2252 self.open_ppt()
2206 - if self.type is None and zipfile.is_zipfile(_file): 2253 + if self.type is None and is_zipfile(_file):
2207 # Zip file, which may be an OpenXML document 2254 # Zip file, which may be an OpenXML document
2208 self.open_openxml(_file) 2255 self.open_openxml(_file)
2209 if self.type is None: 2256 if self.type is None:
@@ -2606,7 +2653,7 @@ class VBA_Parser(object): @@ -2606,7 +2653,7 @@ class VBA_Parser(object):
2606 # Also look for VBA code in any stream including orphans 2653 # Also look for VBA code in any stream including orphans
2607 # (happens in some malformed files) 2654 # (happens in some malformed files)
2608 ole = self.ole_file 2655 ole = self.ole_file
2609 - for sid in range(len(ole.direntries)): 2656 + for sid in xrange(len(ole.direntries)):
2610 # check if id is already done above: 2657 # check if id is already done above:
2611 log.debug('Checking DirEntry #%d' % sid) 2658 log.debug('Checking DirEntry #%d' % sid)
2612 d = ole.direntries[sid] 2659 d = ole.direntries[sid]
@@ -2672,7 +2719,7 @@ class VBA_Parser(object): @@ -2672,7 +2719,7 @@ class VBA_Parser(object):
2672 # Also look for VBA code in any stream including orphans 2719 # Also look for VBA code in any stream including orphans
2673 # (happens in some malformed files) 2720 # (happens in some malformed files)
2674 ole = self.ole_file 2721 ole = self.ole_file
2675 - for sid in range(len(ole.direntries)): 2722 + for sid in xrange(len(ole.direntries)):
2676 # check if id is already done above: 2723 # check if id is already done above:
2677 log.debug('Checking DirEntry #%d' % sid) 2724 log.debug('Checking DirEntry #%d' % sid)
2678 if sid in vba_stream_ids: 2725 if sid in vba_stream_ids:
@@ -3099,7 +3146,7 @@ class VBA_Parser_CLI(VBA_Parser): @@ -3099,7 +3146,7 @@ class VBA_Parser_CLI(VBA_Parser):
3099 if self.detect_vba_macros(): 3146 if self.detect_vba_macros():
3100 # print a waiting message only if the output is not redirected to a file: 3147 # print a waiting message only if the output is not redirected to a file:
3101 if sys.stdout.isatty(): 3148 if sys.stdout.isatty():
3102 - print('Analysis...\r') 3149 + print('Analysis...\r', end='')
3103 sys.stdout.flush() 3150 sys.stdout.flush()
3104 self.analyze_macros(show_decoded_strings=show_decoded_strings, 3151 self.analyze_macros(show_decoded_strings=show_decoded_strings,
3105 deobfuscate=deobfuscate) 3152 deobfuscate=deobfuscate)