Commit 1c52c0d5430856fcdef9f2d8f4720b0180a8ffa9

Authored by decalage2
1 parent b8c80db7

olevba: synchronized some changes with olevba3 (issue #106)

oletools/olevba.py
... ... @@ -269,6 +269,8 @@ import ppt_parser
269 269 import email.feedparser
270 270 email.feedparser.headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:?|[\t ])')
271 271  
  272 +# === PYTHON 2+3 SUPPORT ======================================================
  273 +
272 274 if sys.version_info[0] <= 2:
273 275 # Python 2.x
274 276 if sys.version_info[1] <= 6:
... ... @@ -281,6 +283,8 @@ if sys.version_info[0] <= 2:
281 283 else:
282 284 # Python 3.x+
283 285 from zipfile import is_zipfile
  286 + # xrange is now called range:
  287 + xrange = range
284 288  
285 289 # === LOGGING =================================================================
286 290  
... ... @@ -443,7 +447,7 @@ TYPE2TAG = {
443 447  
444 448  
445 449 # MSO files ActiveMime header magic
446   -MSO_ACTIVEMIME_HEADER = 'ActiveMime'
  450 +MSO_ACTIVEMIME_HEADER = b'ActiveMime'
447 451  
448 452 MODULE_EXTENSION = "bas"
449 453 CLASS_EXTENSION = "cls"
... ... @@ -2252,7 +2256,7 @@ class VBA_Parser(object):
2252 2256 if data is None:
2253 2257 data = open(filename, 'rb').read()
2254 2258 # check if it is a Word 2003 XML file (WordProcessingML): must contain the namespace
2255   - if 'http://schemas.microsoft.com/office/word/2003/wordml' in data:
  2259 + if b'http://schemas.microsoft.com/office/word/2003/wordml' in data:
2256 2260 self.open_word2003xml(data)
2257 2261 # store a lowercase version for the next tests:
2258 2262 data_lowercase = data.lower()
... ... @@ -2262,14 +2266,14 @@ class VBA_Parser(object):
2262 2266 # and even whitespaces in between "MIME", "-", "Version" and ":". The version number is ignored.
2263 2267 # And the line is case insensitive.
2264 2268 # so we'll just check the presence of mime, version and multipart anywhere:
2265   - if self.type is None and 'mime' in data_lowercase and 'version' in data_lowercase \
2266   - and 'multipart' in data_lowercase:
  2269 + if self.type is None and b'mime' in data_lowercase and b'version' in data_lowercase \
  2270 + and b'multipart' in data_lowercase:
2267 2271 self.open_mht(data)
2268 2272 #TODO: handle exceptions
2269 2273 #TODO: Excel 2003 XML
2270 2274 # Check if this is a plain text VBA or VBScript file:
2271 2275 # To avoid scanning binary files, we simply check for some control chars:
2272   - if self.type is None and '\x00' not in data:
  2276 + if self.type is None and b'\x00' not in data:
2273 2277 self.open_text(data)
2274 2278 if self.type is None:
2275 2279 # At this stage, could not match a known format:
... ...
oletools/olevba3.py
... ... @@ -12,6 +12,7 @@ Supported formats:
12 12 - PowerPoint 97-2003 (.ppt), PowerPoint 2007+ (.pptm, .ppsm)
13 13 - Word 2003 XML (.xml)
14 14 - Word/Excel Single File Web Page / MHTML (.mht)
  15 +- Publisher (.pub)
15 16  
16 17 Author: Philippe Lagadec - http://www.decalage.info
17 18 License: BSD, see source code or documentation
... ... @@ -72,6 +73,8 @@ https://github.com/unixfreak0037/officeparser
72 73 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
73 74 # SOFTWARE.
74 75  
  76 +from __future__ import print_function
  77 +
75 78 #------------------------------------------------------------------------------
76 79 # CHANGELOG:
77 80 # 2014-08-05 v0.01 PL: - first version based on officeparser code
... ... @@ -178,9 +181,16 @@ https://github.com/unixfreak0037/officeparser
178 181 # 2016-06-12 v0.50 PL: - fixed small bugs in VBA parsing code
179 182 # 2016-07-01 PL: - fixed issue #58 with format() to support Python 2.6
180 183 # 2016-07-29 CH: - fixed several bugs including #73 (Mac Roman encoding)
  184 +# 2016-08-31 PL: - added autoexec keyword InkPicture_Painted
  185 +# - detect_autoexec now returns the exact keyword found
  186 +# 2016-09-05 PL: - added autoexec keywords for MS Publisher (.pub)
  187 +# 2016-09-06 PL: - fixed issue #20, is_zipfile on Python 2.6
  188 +# 2016-09-12 PL: - enabled packrat to improve pyparsing performance
  189 +# 2016-10-25 PL: - fixed raise and print statements for Python 3
181 190 # 2016-10-25 PL: - fixed regex bytes strings (PR/issue #100)
  191 +# 2016-11-03 v0.51 PL: - added EnumDateFormats and EnumSystemLanguageGroupsW
182 192  
183   -__version__ = '0.50'
  193 +__version__ = '0.51a'
184 194  
185 195 #------------------------------------------------------------------------------
186 196 # TODO:
... ... @@ -260,6 +270,22 @@ import oletools.ppt_parser as ppt_parser
260 270 import email.feedparser
261 271 email.feedparser.headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:?|[\t ])')
262 272  
  273 +# === PYTHON 2+3 SUPPORT ======================================================
  274 +
  275 +if sys.version_info[0] <= 2:
  276 + # Python 2.x
  277 + if sys.version_info[1] <= 6:
  278 + # Python 2.6
  279 + # use is_zipfile backported from Python 2.7:
  280 + from thirdparty.zipfile27 import is_zipfile
  281 + else:
  282 + # Python 2.7
  283 + from zipfile import is_zipfile
  284 +else:
  285 + # Python 3.x+
  286 + from zipfile import is_zipfile
  287 + # xrange is now called range:
  288 + xrange = range
263 289  
264 290 # === LOGGING =================================================================
265 291  
... ... @@ -438,7 +464,7 @@ ATTR_NAME = NS_W + 'name'
438 464 AUTOEXEC_KEYWORDS = {
439 465 # MS Word:
440 466 'Runs when the Word document is opened':
441   - ('AutoExec', 'AutoOpen', 'Document_Open', 'DocumentOpen'),
  467 + ('AutoExec', 'AutoOpen', 'DocumentOpen'),
442 468 'Runs when the Word document is closed':
443 469 ('AutoExit', 'AutoClose', 'Document_Close', 'DocumentBeforeClose'),
444 470 'Runs when the Word document is modified':
... ... @@ -446,13 +472,24 @@ AUTOEXEC_KEYWORDS = {
446 472 'Runs when a new Word document is created':
447 473 ('AutoNew', 'Document_New', 'NewDocument'),
448 474  
  475 + # MS Word and Publisher:
  476 + 'Runs when the Word or Publisher document is opened':
  477 + ('Document_Open',),
  478 + 'Runs when the Publisher document is closed':
  479 + ('Document_BeforeClose',),
  480 +
449 481 # MS Excel:
450 482 'Runs when the Excel Workbook is opened':
451 483 ('Auto_Open', 'Workbook_Open', 'Workbook_Activate'),
452 484 'Runs when the Excel Workbook is closed':
453 485 ('Auto_Close', 'Workbook_Close'),
454 486  
455   - #TODO: full list in MS specs??
  487 + # any MS Office application:
  488 + 'Runs when the file is opened (using InkPicture ActiveX object)':
  489 + # ref:https://twitter.com/joe4security/status/770691099988025345
  490 + (r'\w+_Painted',),
  491 + 'Runs when the file is opened and ActiveX objects trigger events':
  492 + (r'\w+_(?:GotFocus|LostFocus|MouseHover)',),
456 493 }
457 494  
458 495 # Suspicious Keywords that may be used by malware
... ... @@ -516,7 +553,11 @@ SUSPICIOUS_KEYWORDS = {
516 553 ('Lib',),
517 554 'May inject code into another process':
518 555 ('CreateThread', 'VirtualAlloc', # (issue #9) suggested by Davy Douhine - used by MSF payload
  556 + 'VirtualAllocEx', 'RtlMoveMemory',
519 557 ),
  558 + 'May run a shellcode in memory':
  559 + ('EnumSystemLanguageGroupsW?', # Used by Hancitor in Oct 2016
  560 + 'EnumDateFormats(?:W|(?:Ex){1,2})?'), # see https://msdn.microsoft.com/en-us/library/windows/desktop/dd317810(v=vs.85).aspx
520 561 'May download files from the Internet':
521 562 #TODO: regex to find urlmon+URLDownloadToFileA on same line
522 563 ('URLDownloadToFileA', 'Msxml2.XMLHTTP', 'Microsoft.XMLHTTP',
... ... @@ -532,7 +573,7 @@ SUSPICIOUS_KEYWORDS = {
532 573 'May attempt to obfuscate malicious function calls':
533 574 ('CallByName',),
534 575 #CallByName: http://msdn.microsoft.com/en-us/library/office/gg278760%28v=office.15%29.aspx
535   - 'May attempt to obfuscate specific strings':
  576 + 'May attempt to obfuscate specific strings (use option --deobf to deobfuscate)':
536 577 #TODO: regex to find several Chr*, not just one
537 578 ('Chr', 'ChrB', 'ChrW', 'StrReverse', 'Xor'),
538 579 #Chr: http://msdn.microsoft.com/en-us/library/office/gg264465%28v=office.15%29.aspx
... ... @@ -571,8 +612,6 @@ SUSPICIOUS_KEYWORDS = {
571 612 'May detect WinJail Sandbox':
572 613 # ref: http://www.cplusplus.com/forum/windows/96874/
573 614 ('Afx:400000:0',),
574   - 'Memory manipulation':
575   - ('VirtualAllocEx', 'RtlMoveMemory'),
576 615 }
577 616  
578 617 # Regular Expression for a URL:
... ... @@ -646,6 +685,10 @@ re_printable_string = re.compile(b'[\\t\\r\\n\\x20-\\xFF]{5,}')
646 685 # TODO: set whitespaces according to VBA
647 686 # TODO: merge extended lines before parsing
648 687  
  688 +# Enable PackRat for better performance:
  689 +# (see https://pythonhosted.org/pyparsing/pyparsing.ParserElement-class.html#enablePackrat)
  690 +ParserElement.enablePackrat()
  691 +
649 692 # VBA identifier chars (from MS-VBAL 3.3.5)
650 693 vba_identifier_chars = alphanums + '_'
651 694  
... ... @@ -1712,9 +1755,11 @@ def detect_autoexec(vba_code, obfuscation=None):
1712 1755 for keyword in keywords:
1713 1756 #TODO: if keyword is already a compiled regex, use it as-is
1714 1757 # search using regex to detect word boundaries:
1715   - if re.search(r'(?i)\b' + keyword + r'\b', vba_code):
  1758 + match = re.search(r'(?i)\b' + keyword + r'\b', vba_code)
  1759 + if match:
1716 1760 #if keyword.lower() in vba_code:
1717   - results.append((keyword, description + obf_text))
  1761 + found_keyword = match.group()
  1762 + results.append((found_keyword, description + obf_text))
1718 1763 return results
1719 1764  
1720 1765  
... ... @@ -1736,9 +1781,11 @@ def detect_suspicious(vba_code, obfuscation=None):
1736 1781 for description, keywords in SUSPICIOUS_KEYWORDS.items():
1737 1782 for keyword in keywords:
1738 1783 # search using regex to detect word boundaries:
1739   - if re.search(r'(?i)\b' + keyword + r'\b', vba_code):
  1784 + match = re.search(r'(?i)\b' + keyword + r'\b', vba_code)
  1785 + if match:
1740 1786 #if keyword.lower() in vba_code:
1741   - results.append((keyword, description + obf_text))
  1787 + found_keyword = match.group()
  1788 + results.append((found_keyword, description + obf_text))
1742 1789 return results
1743 1790  
1744 1791  
... ... @@ -2203,7 +2250,7 @@ class VBA_Parser(object):
2203 2250  
2204 2251 # if this worked, try whether it is a ppt file (special ole file)
2205 2252 self.open_ppt()
2206   - if self.type is None and zipfile.is_zipfile(_file):
  2253 + if self.type is None and is_zipfile(_file):
2207 2254 # Zip file, which may be an OpenXML document
2208 2255 self.open_openxml(_file)
2209 2256 if self.type is None:
... ... @@ -2606,7 +2653,7 @@ class VBA_Parser(object):
2606 2653 # Also look for VBA code in any stream including orphans
2607 2654 # (happens in some malformed files)
2608 2655 ole = self.ole_file
2609   - for sid in range(len(ole.direntries)):
  2656 + for sid in xrange(len(ole.direntries)):
2610 2657 # check if id is already done above:
2611 2658 log.debug('Checking DirEntry #%d' % sid)
2612 2659 d = ole.direntries[sid]
... ... @@ -2672,7 +2719,7 @@ class VBA_Parser(object):
2672 2719 # Also look for VBA code in any stream including orphans
2673 2720 # (happens in some malformed files)
2674 2721 ole = self.ole_file
2675   - for sid in range(len(ole.direntries)):
  2722 + for sid in xrange(len(ole.direntries)):
2676 2723 # check if id is already done above:
2677 2724 log.debug('Checking DirEntry #%d' % sid)
2678 2725 if sid in vba_stream_ids:
... ... @@ -3099,7 +3146,7 @@ class VBA_Parser_CLI(VBA_Parser):
3099 3146 if self.detect_vba_macros():
3100 3147 # print a waiting message only if the output is not redirected to a file:
3101 3148 if sys.stdout.isatty():
3102   - print('Analysis...\r')
  3149 + print('Analysis...\r', end='')
3103 3150 sys.stdout.flush()
3104 3151 self.analyze_macros(show_decoded_strings=show_decoded_strings,
3105 3152 deobfuscate=deobfuscate)
... ...