Commit d71220495077b1e0b917829ffa756416ece76764

Authored by Philippe Lagadec
1 parent c47b13c1

olevba: VBA_Parser: split each file format parser into a separate method

Showing 1 changed file with 142 additions and 96 deletions
oletools/olevba.py
@@ -153,8 +153,9 @@ https://github.com/unixfreak0037/officeparser @@ -153,8 +153,9 @@ https://github.com/unixfreak0037/officeparser
153 # - disabled unused option --each 153 # - disabled unused option --each
154 # 2015-09-22 v0.41 PL: - added new option --reveal 154 # 2015-09-22 v0.41 PL: - added new option --reveal
155 # - added suspicious strings for PowerShell.exe options 155 # - added suspicious strings for PowerShell.exe options
  156 +# 2015-10-09 v0.42 PL: - VBA_Parser: split each format into a separate method
156 157
157 -__version__ = '0.41' 158 +__version__ = '0.42'
158 159
159 #------------------------------------------------------------------------------ 160 #------------------------------------------------------------------------------
160 # TODO: 161 # TODO:
@@ -1719,34 +1720,10 @@ class VBA_Parser(object): @@ -1719,34 +1720,10 @@ class VBA_Parser(object):
1719 # self.filename = '<file-like object>' 1720 # self.filename = '<file-like object>'
1720 if olefile.isOleFile(_file): 1721 if olefile.isOleFile(_file):
1721 # This looks like an OLE file 1722 # This looks like an OLE file
1722 - logging.info('Opening OLE file %s' % self.filename)  
1723 - # Open and parse the OLE file, using unicode for path names:  
1724 - self.type = TYPE_OLE  
1725 - # TODO: handle OLE parsing exceptions  
1726 - self.ole_file = olefile.OleFileIO(_file, path_encoding=None)  
1727 - # TODO: raise TypeError if this is a Powerpoint 97 file, since VBA macros cannot be detected yet 1723 + self.open_ole(_file)
1728 elif zipfile.is_zipfile(_file): 1724 elif zipfile.is_zipfile(_file):
1729 - # This looks like a zip file, need to look for vbaProject.bin inside  
1730 - # It can be any OLE file inside the archive  
1731 - #...because vbaProject.bin can be renamed:  
1732 - # see http://www.decalage.info/files/JCV07_Lagadec_OpenDocument_OpenXML_v4_decalage.pdf#page=18  
1733 - logging.info('Opening ZIP/OpenXML file %s' % self.filename)  
1734 - self.type = TYPE_OpenXML  
1735 - z = zipfile.ZipFile(_file)  
1736 - #TODO: check if this is actually an OpenXML file  
1737 - #TODO: if the zip file is encrypted, suggest to use the -z option, or try '-z infected' automatically  
1738 - # check each file within the zip if it is an OLE file, by reading its magic:  
1739 - for subfile in z.namelist():  
1740 - magic = z.open(subfile).read(len(olefile.MAGIC))  
1741 - if magic == olefile.MAGIC:  
1742 - logging.debug('Opening OLE file %s within zip' % subfile)  
1743 - ole_data = z.open(subfile).read()  
1744 - try:  
1745 - self.ole_subfiles.append(VBA_Parser(filename=subfile, data=ole_data))  
1746 - except:  
1747 - logging.debug('%s is not a valid OLE file' % subfile)  
1748 - continue  
1749 - z.close() 1725 + # Zip file, which may be an OpenXML document
  1726 + self.open_openxml(_file)
1750 else: 1727 else:
1751 # read file from disk, check if it is a Word 2003 XML file (WordProcessingML), Excel 2003 XML, 1728 # read file from disk, check if it is a Word 2003 XML file (WordProcessingML), Excel 2003 XML,
1752 # or a plain text file containing VBA code 1729 # or a plain text file containing VBA code
@@ -1754,38 +1731,9 @@ class VBA_Parser(object): @@ -1754,38 +1731,9 @@ class VBA_Parser(object):
1754 data = open(filename, 'rb').read() 1731 data = open(filename, 'rb').read()
1755 # store a lowercase version for some tests: 1732 # store a lowercase version for some tests:
1756 data_lowercase = data.lower() 1733 data_lowercase = data.lower()
1757 - # TODO: move each format parser to a separate method  
1758 # check if it is a Word 2003 XML file (WordProcessingML): must contain the namespace 1734 # check if it is a Word 2003 XML file (WordProcessingML): must contain the namespace
1759 if 'http://schemas.microsoft.com/office/word/2003/wordml' in data: 1735 if 'http://schemas.microsoft.com/office/word/2003/wordml' in data:
1760 - logging.info('Opening Word 2003 XML file %s' % self.filename)  
1761 - try:  
1762 - # parse the XML content  
1763 - # TODO: handle XML parsing exceptions  
1764 - et = ET.fromstring(data)  
1765 - # set type only if parsing succeeds  
1766 - self.type = TYPE_Word2003_XML  
1767 - # find all the binData elements:  
1768 - for bindata in et.getiterator(TAG_BINDATA):  
1769 - # the binData content is an OLE container for the VBA project, compressed  
1770 - # using the ActiveMime/MSO format (zlib-compressed), and Base64 encoded.  
1771 - # get the filename:  
1772 - fname = bindata.get(ATTR_NAME, 'noname.mso')  
1773 - # decode the base64 activemime  
1774 - mso_data = binascii.a2b_base64(bindata.text)  
1775 - if is_mso_file(mso_data):  
1776 - # decompress the zlib data stored in the MSO file, which is the OLE container:  
1777 - # TODO: handle different offsets => separate function  
1778 - ole_data = mso_file_extract(mso_data)  
1779 - try:  
1780 - self.ole_subfiles.append(VBA_Parser(filename=fname, data=ole_data))  
1781 - except:  
1782 - logging.error('%s does not contain a valid OLE file' % fname)  
1783 - else:  
1784 - logging.error('%s is not a valid MSO file' % fname)  
1785 - except:  
1786 - # TODO: differentiate exceptions for each parsing stage  
1787 - logging.exception('Failed XML parsing for file %r' % self.filename)  
1788 - pass 1736 + self.open_word2003xml(data)
1789 # check if it is a MHT file (MIME HTML, Word or Excel saved as "Single File Web Page"): 1737 # check if it is a MHT file (MIME HTML, Word or Excel saved as "Single File Web Page"):
1790 # According to my tests, these files usually start with "MIME-Version: 1.0" on the 1st line 1738 # According to my tests, these files usually start with "MIME-Version: 1.0" on the 1st line
1791 # BUT Word accepts a blank line or other MIME headers inserted before, 1739 # BUT Word accepts a blank line or other MIME headers inserted before,
@@ -1793,44 +1741,7 @@ class VBA_Parser(object): @@ -1793,44 +1741,7 @@ class VBA_Parser(object):
1793 # And the line is case insensitive. 1741 # And the line is case insensitive.
1794 # so we'll just check the presence of mime, version and multipart anywhere: 1742 # so we'll just check the presence of mime, version and multipart anywhere:
1795 if self.type is None and 'mime' in data_lowercase and 'version' in data_lowercase and 'multipart' in data_lowercase: 1743 if self.type is None and 'mime' in data_lowercase and 'version' in data_lowercase and 'multipart' in data_lowercase:
1796 - logging.info('Opening MHTML file %s' % self.filename)  
1797 - try:  
1798 - # parse the MIME content  
1799 - # remove any leading whitespace or newline (workaround for issue in email package)  
1800 - stripped_data = data.lstrip('\r\n\t ')  
1801 - mhtml = email.message_from_string(stripped_data)  
1802 - self.type = TYPE_MHTML  
1803 - # find all the attached files:  
1804 - for part in mhtml.walk():  
1805 - content_type = part.get_content_type() # always returns a value  
1806 - fname = part.get_filename(None) # returns None if it fails  
1807 - # TODO: get content-location if no filename  
1808 - logging.debug('MHTML part: filename=%r, content-type=%r' % (fname, content_type))  
1809 - part_data = part.get_payload(decode=True)  
1810 - # VBA macros are stored in a binary file named "editdata.mso".  
1811 - # the data content is an OLE container for the VBA project, compressed  
1812 - # using the ActiveMime/MSO format (zlib-compressed), and Base64 encoded.  
1813 - # decompress the zlib data starting at offset 0x32, which is the OLE container:  
1814 - # check ActiveMime header:  
1815 - if isinstance(part_data, str) and is_mso_file(part_data):  
1816 - logging.debug('Found ActiveMime header, decompressing MSO container')  
1817 - try:  
1818 - ole_data = mso_file_extract(part_data)  
1819 - try:  
1820 - # TODO: check if it is actually an OLE file  
1821 - # TODO: get the MSO filename from content_location?  
1822 - self.ole_subfiles.append(VBA_Parser(filename=fname, data=ole_data))  
1823 - except:  
1824 - logging.debug('%s does not contain a valid OLE file' % fname)  
1825 - except:  
1826 - logging.exception('Failed decompressing an MSO container in %r - %s'  
1827 - % (fname, MSG_OLEVBA_ISSUES))  
1828 - # TODO: bug here - need to split in smaller functions/classes?  
1829 - except:  
1830 - logging.exception('Failed MIME parsing for file %r - %s'  
1831 - % (self.filename, MSG_OLEVBA_ISSUES))  
1832 - pass  
1833 - 1744 + self.open_mht(data)
1834 #TODO: handle exceptions 1745 #TODO: handle exceptions
1835 #TODO: Excel 2003 XML 1746 #TODO: Excel 2003 XML
1836 #TODO: plain text VBA file 1747 #TODO: plain text VBA file
@@ -1839,6 +1750,141 @@ class VBA_Parser(object): @@ -1839,6 +1750,141 @@ class VBA_Parser(object):
1839 logging.error(msg) 1750 logging.error(msg)
1840 raise TypeError(msg) 1751 raise TypeError(msg)
1841 1752
  1753 + def open_ole(self, _file):
  1754 + """
  1755 + Open an OLE file
  1756 + :param _file: filename or file contents in a file object
  1757 + :return: nothing
  1758 + """
  1759 + logging.info('Opening OLE file %s' % self.filename)
  1760 + try:
  1761 + # Open and parse the OLE file, using unicode for path names:
  1762 + self.ole_file = olefile.OleFileIO(_file, path_encoding=None)
  1763 + # TODO: raise TypeError if this is a Powerpoint 97 file, since VBA macros cannot be detected yet
  1764 + # set type only if parsing succeeds
  1765 + self.type = TYPE_OLE
  1766 + except:
  1767 + # TODO: handle OLE parsing exceptions
  1768 + logging.exception('Failed OLE parsing for file %r' % self.filename)
  1769 + pass
  1770 +
  1771 +
  1772 + def open_openxml(self, _file):
  1773 + """
  1774 + Open an OpenXML file
  1775 + :param _file: filename or file contents in a file object
  1776 + :return: nothing
  1777 + """
  1778 + # This looks like a zip file, need to look for vbaProject.bin inside
  1779 + # It can be any OLE file inside the archive
  1780 + #...because vbaProject.bin can be renamed:
  1781 + # see http://www.decalage.info/files/JCV07_Lagadec_OpenDocument_OpenXML_v4_decalage.pdf#page=18
  1782 + logging.info('Opening ZIP/OpenXML file %s' % self.filename)
  1783 + try:
  1784 + z = zipfile.ZipFile(_file)
  1785 + #TODO: check if this is actually an OpenXML file
  1786 + #TODO: if the zip file is encrypted, suggest to use the -z option, or try '-z infected' automatically
  1787 + # check each file within the zip if it is an OLE file, by reading its magic:
  1788 + for subfile in z.namelist():
  1789 + magic = z.open(subfile).read(len(olefile.MAGIC))
  1790 + if magic == olefile.MAGIC:
  1791 + logging.debug('Opening OLE file %s within zip' % subfile)
  1792 + ole_data = z.open(subfile).read()
  1793 + try:
  1794 + self.ole_subfiles.append(VBA_Parser(filename=subfile, data=ole_data))
  1795 + except:
  1796 + logging.debug('%s is not a valid OLE file' % subfile)
  1797 + continue
  1798 + z.close()
  1799 + # set type only if parsing succeeds
  1800 + self.type = TYPE_OpenXML
  1801 + except:
  1802 + # TODO: handle parsing exceptions
  1803 + logging.exception('Failed Zip/OpenXML parsing for file %r' % self.filename)
  1804 + pass
  1805 +
  1806 + def open_word2003xml(self, data):
  1807 + """
  1808 + Open a Word 2003 XML file
  1809 + :param data: file contents in a string or bytes
  1810 + :return: nothing
  1811 + """
  1812 + logging.info('Opening Word 2003 XML file %s' % self.filename)
  1813 + try:
  1814 + # parse the XML content
  1815 + # TODO: handle XML parsing exceptions
  1816 + et = ET.fromstring(data)
  1817 + # find all the binData elements:
  1818 + for bindata in et.getiterator(TAG_BINDATA):
  1819 + # the binData content is an OLE container for the VBA project, compressed
  1820 + # using the ActiveMime/MSO format (zlib-compressed), and Base64 encoded.
  1821 + # get the filename:
  1822 + fname = bindata.get(ATTR_NAME, 'noname.mso')
  1823 + # decode the base64 activemime
  1824 + mso_data = binascii.a2b_base64(bindata.text)
  1825 + if is_mso_file(mso_data):
  1826 + # decompress the zlib data stored in the MSO file, which is the OLE container:
  1827 + # TODO: handle different offsets => separate function
  1828 + ole_data = mso_file_extract(mso_data)
  1829 + try:
  1830 + self.ole_subfiles.append(VBA_Parser(filename=fname, data=ole_data))
  1831 + except:
  1832 + logging.error('%s does not contain a valid OLE file' % fname)
  1833 + else:
  1834 + logging.error('%s is not a valid MSO file' % fname)
  1835 + # set type only if parsing succeeds
  1836 + self.type = TYPE_Word2003_XML
  1837 + except:
  1838 + # TODO: differentiate exceptions for each parsing stage
  1839 + logging.exception('Failed XML parsing for file %r' % self.filename)
  1840 + pass
  1841 +
  1842 + def open_mht(self, data):
  1843 + """
  1844 + Open a MHTML file
  1845 + :param data: file contents in a string or bytes
  1846 + :return: nothing
  1847 + """
  1848 + logging.info('Opening MHTML file %s' % self.filename)
  1849 + try:
  1850 + # parse the MIME content
  1851 + # remove any leading whitespace or newline (workaround for issue in email package)
  1852 + stripped_data = data.lstrip('\r\n\t ')
  1853 + mhtml = email.message_from_string(stripped_data)
  1854 + # find all the attached files:
  1855 + for part in mhtml.walk():
  1856 + content_type = part.get_content_type() # always returns a value
  1857 + fname = part.get_filename(None) # returns None if it fails
  1858 + # TODO: get content-location if no filename
  1859 + logging.debug('MHTML part: filename=%r, content-type=%r' % (fname, content_type))
  1860 + part_data = part.get_payload(decode=True)
  1861 + # VBA macros are stored in a binary file named "editdata.mso".
  1862 + # the data content is an OLE container for the VBA project, compressed
  1863 + # using the ActiveMime/MSO format (zlib-compressed), and Base64 encoded.
  1864 + # decompress the zlib data starting at offset 0x32, which is the OLE container:
  1865 + # check ActiveMime header:
  1866 + if isinstance(part_data, str) and is_mso_file(part_data):
  1867 + logging.debug('Found ActiveMime header, decompressing MSO container')
  1868 + try:
  1869 + ole_data = mso_file_extract(part_data)
  1870 + try:
  1871 + # TODO: check if it is actually an OLE file
  1872 + # TODO: get the MSO filename from content_location?
  1873 + self.ole_subfiles.append(VBA_Parser(filename=fname, data=ole_data))
  1874 + except:
  1875 + logging.debug('%s does not contain a valid OLE file' % fname)
  1876 + except:
  1877 + logging.exception('Failed decompressing an MSO container in %r - %s'
  1878 + % (fname, MSG_OLEVBA_ISSUES))
  1879 + # TODO: bug here - need to split in smaller functions/classes?
  1880 + # set type only if parsing succeeds
  1881 + self.type = TYPE_MHTML
  1882 + except:
  1883 + logging.exception('Failed MIME parsing for file %r - %s'
  1884 + % (self.filename, MSG_OLEVBA_ISSUES))
  1885 + pass
  1886 +
  1887 +
1842 def find_vba_projects(self): 1888 def find_vba_projects(self):
1843 """ 1889 """
1844 Finds all the VBA projects stored in an OLE file. 1890 Finds all the VBA projects stored in an OLE file.