Commit d71220495077b1e0b917829ffa756416ece76764

Authored by Philippe Lagadec
1 parent c47b13c1

olevba: VBA_Parser: split each file format parser into a separate method

Showing 1 changed file with 142 additions and 96 deletions
oletools/olevba.py
... ... @@ -153,8 +153,9 @@ https://github.com/unixfreak0037/officeparser
153 153 # - disabled unused option --each
154 154 # 2015-09-22 v0.41 PL: - added new option --reveal
155 155 # - added suspicious strings for PowerShell.exe options
  156 +# 2015-10-09 v0.42 PL: - VBA_Parser: split each format into a separate method
156 157  
157   -__version__ = '0.41'
  158 +__version__ = '0.42'
158 159  
159 160 #------------------------------------------------------------------------------
160 161 # TODO:
... ... @@ -1719,34 +1720,10 @@ class VBA_Parser(object):
1719 1720 # self.filename = '<file-like object>'
1720 1721 if olefile.isOleFile(_file):
1721 1722 # This looks like an OLE file
1722   - logging.info('Opening OLE file %s' % self.filename)
1723   - # Open and parse the OLE file, using unicode for path names:
1724   - self.type = TYPE_OLE
1725   - # TODO: handle OLE parsing exceptions
1726   - self.ole_file = olefile.OleFileIO(_file, path_encoding=None)
1727   - # TODO: raise TypeError if this is a Powerpoint 97 file, since VBA macros cannot be detected yet
  1723 + self.open_ole(_file)
1728 1724 elif zipfile.is_zipfile(_file):
1729   - # This looks like a zip file, need to look for vbaProject.bin inside
1730   - # It can be any OLE file inside the archive
1731   - #...because vbaProject.bin can be renamed:
1732   - # see http://www.decalage.info/files/JCV07_Lagadec_OpenDocument_OpenXML_v4_decalage.pdf#page=18
1733   - logging.info('Opening ZIP/OpenXML file %s' % self.filename)
1734   - self.type = TYPE_OpenXML
1735   - z = zipfile.ZipFile(_file)
1736   - #TODO: check if this is actually an OpenXML file
1737   - #TODO: if the zip file is encrypted, suggest to use the -z option, or try '-z infected' automatically
1738   - # check each file within the zip if it is an OLE file, by reading its magic:
1739   - for subfile in z.namelist():
1740   - magic = z.open(subfile).read(len(olefile.MAGIC))
1741   - if magic == olefile.MAGIC:
1742   - logging.debug('Opening OLE file %s within zip' % subfile)
1743   - ole_data = z.open(subfile).read()
1744   - try:
1745   - self.ole_subfiles.append(VBA_Parser(filename=subfile, data=ole_data))
1746   - except:
1747   - logging.debug('%s is not a valid OLE file' % subfile)
1748   - continue
1749   - z.close()
  1725 + # Zip file, which may be an OpenXML document
  1726 + self.open_openxml(_file)
1750 1727 else:
1751 1728 # read file from disk, check if it is a Word 2003 XML file (WordProcessingML), Excel 2003 XML,
1752 1729 # or a plain text file containing VBA code
... ... @@ -1754,38 +1731,9 @@ class VBA_Parser(object):
1754 1731 data = open(filename, 'rb').read()
1755 1732 # store a lowercase version for some tests:
1756 1733 data_lowercase = data.lower()
1757   - # TODO: move each format parser to a separate method
1758 1734 # check if it is a Word 2003 XML file (WordProcessingML): must contain the namespace
1759 1735 if 'http://schemas.microsoft.com/office/word/2003/wordml' in data:
1760   - logging.info('Opening Word 2003 XML file %s' % self.filename)
1761   - try:
1762   - # parse the XML content
1763   - # TODO: handle XML parsing exceptions
1764   - et = ET.fromstring(data)
1765   - # set type only if parsing succeeds
1766   - self.type = TYPE_Word2003_XML
1767   - # find all the binData elements:
1768   - for bindata in et.getiterator(TAG_BINDATA):
1769   - # the binData content is an OLE container for the VBA project, compressed
1770   - # using the ActiveMime/MSO format (zlib-compressed), and Base64 encoded.
1771   - # get the filename:
1772   - fname = bindata.get(ATTR_NAME, 'noname.mso')
1773   - # decode the base64 activemime
1774   - mso_data = binascii.a2b_base64(bindata.text)
1775   - if is_mso_file(mso_data):
1776   - # decompress the zlib data stored in the MSO file, which is the OLE container:
1777   - # TODO: handle different offsets => separate function
1778   - ole_data = mso_file_extract(mso_data)
1779   - try:
1780   - self.ole_subfiles.append(VBA_Parser(filename=fname, data=ole_data))
1781   - except:
1782   - logging.error('%s does not contain a valid OLE file' % fname)
1783   - else:
1784   - logging.error('%s is not a valid MSO file' % fname)
1785   - except:
1786   - # TODO: differentiate exceptions for each parsing stage
1787   - logging.exception('Failed XML parsing for file %r' % self.filename)
1788   - pass
  1736 + self.open_word2003xml(data)
1789 1737 # check if it is an MHT file (MIME HTML, Word or Excel saved as "Single File Web Page"):
1790 1738 # According to my tests, these files usually start with "MIME-Version: 1.0" on the 1st line
1791 1739 # BUT Word accepts a blank line or other MIME headers inserted before,
... ... @@ -1793,44 +1741,7 @@ class VBA_Parser(object):
1793 1741 # And the line is case insensitive.
1794 1742 # so we'll just check the presence of mime, version and multipart anywhere:
1795 1743 if self.type is None and 'mime' in data_lowercase and 'version' in data_lowercase and 'multipart' in data_lowercase:
1796   - logging.info('Opening MHTML file %s' % self.filename)
1797   - try:
1798   - # parse the MIME content
1799   - # remove any leading whitespace or newline (workaround for issue in email package)
1800   - stripped_data = data.lstrip('\r\n\t ')
1801   - mhtml = email.message_from_string(stripped_data)
1802   - self.type = TYPE_MHTML
1803   - # find all the attached files:
1804   - for part in mhtml.walk():
1805   - content_type = part.get_content_type() # always returns a value
1806   - fname = part.get_filename(None) # returns None if it fails
1807   - # TODO: get content-location if no filename
1808   - logging.debug('MHTML part: filename=%r, content-type=%r' % (fname, content_type))
1809   - part_data = part.get_payload(decode=True)
1810   - # VBA macros are stored in a binary file named "editdata.mso".
1811   - # the data content is an OLE container for the VBA project, compressed
1812   - # using the ActiveMime/MSO format (zlib-compressed), and Base64 encoded.
1813   - # decompress the zlib data starting at offset 0x32, which is the OLE container:
1814   - # check ActiveMime header:
1815   - if isinstance(part_data, str) and is_mso_file(part_data):
1816   - logging.debug('Found ActiveMime header, decompressing MSO container')
1817   - try:
1818   - ole_data = mso_file_extract(part_data)
1819   - try:
1820   - # TODO: check if it is actually an OLE file
1821   - # TODO: get the MSO filename from content_location?
1822   - self.ole_subfiles.append(VBA_Parser(filename=fname, data=ole_data))
1823   - except:
1824   - logging.debug('%s does not contain a valid OLE file' % fname)
1825   - except:
1826   - logging.exception('Failed decompressing an MSO container in %r - %s'
1827   - % (fname, MSG_OLEVBA_ISSUES))
1828   - # TODO: bug here - need to split in smaller functions/classes?
1829   - except:
1830   - logging.exception('Failed MIME parsing for file %r - %s'
1831   - % (self.filename, MSG_OLEVBA_ISSUES))
1832   - pass
1833   -
  1744 + self.open_mht(data)
1834 1745 #TODO: handle exceptions
1835 1746 #TODO: Excel 2003 XML
1836 1747 #TODO: plain text VBA file
... ... @@ -1839,6 +1750,141 @@ class VBA_Parser(object):
1839 1750 logging.error(msg)
1840 1751 raise TypeError(msg)
1841 1752  
  1753 + def open_ole(self, _file):
  1754 + """
  1755 + Open an OLE file
  1756 + :param _file: filename or file contents in a file object
  1757 + :return: nothing
  1758 + """
  1759 + logging.info('Opening OLE file %s' % self.filename)
  1760 + try:
  1761 + # Open and parse the OLE file, using unicode for path names:
  1762 + self.ole_file = olefile.OleFileIO(_file, path_encoding=None)
  1763 + # TODO: raise TypeError if this is a Powerpoint 97 file, since VBA macros cannot be detected yet
  1764 + # set type only if parsing succeeds
  1765 + self.type = TYPE_OLE
  1766 + except:
  1767 + # TODO: handle OLE parsing exceptions
  1768 + logging.exception('Failed OLE parsing for file %r' % self.filename)
  1769 + pass
  1770 +
  1771 +
  1772 + def open_openxml(self, _file):
  1773 + """
  1774 + Open an OpenXML file
  1775 + :param _file: filename or file contents in a file object
  1776 + :return: nothing
  1777 + """
  1778 + # This looks like a zip file, need to look for vbaProject.bin inside
  1779 + # It can be any OLE file inside the archive
  1780 + #...because vbaProject.bin can be renamed:
  1781 + # see http://www.decalage.info/files/JCV07_Lagadec_OpenDocument_OpenXML_v4_decalage.pdf#page=18
  1782 + logging.info('Opening ZIP/OpenXML file %s' % self.filename)
  1783 + try:
  1784 + z = zipfile.ZipFile(_file)
  1785 + #TODO: check if this is actually an OpenXML file
  1786 + #TODO: if the zip file is encrypted, suggest to use the -z option, or try '-z infected' automatically
  1787 + # check each file within the zip if it is an OLE file, by reading its magic:
  1788 + for subfile in z.namelist():
  1789 + magic = z.open(subfile).read(len(olefile.MAGIC))
  1790 + if magic == olefile.MAGIC:
  1791 + logging.debug('Opening OLE file %s within zip' % subfile)
  1792 + ole_data = z.open(subfile).read()
  1793 + try:
  1794 + self.ole_subfiles.append(VBA_Parser(filename=subfile, data=ole_data))
  1795 + except:
  1796 + logging.debug('%s is not a valid OLE file' % subfile)
  1797 + continue
  1798 + z.close()
  1799 + # set type only if parsing succeeds
  1800 + self.type = TYPE_OpenXML
  1801 + except:
  1802 + # TODO: handle parsing exceptions
  1803 + logging.exception('Failed Zip/OpenXML parsing for file %r' % self.filename)
  1804 + pass
  1805 +
  1806 + def open_word2003xml(self, data):
  1807 + """
  1808 + Open a Word 2003 XML file
  1809 + :param data: file contents in a string or bytes
  1810 + :return: nothing
  1811 + """
  1812 + logging.info('Opening Word 2003 XML file %s' % self.filename)
  1813 + try:
  1814 + # parse the XML content
  1815 + # TODO: handle XML parsing exceptions
  1816 + et = ET.fromstring(data)
  1817 + # find all the binData elements:
  1818 + for bindata in et.getiterator(TAG_BINDATA):
  1819 + # the binData content is an OLE container for the VBA project, compressed
  1820 + # using the ActiveMime/MSO format (zlib-compressed), and Base64 encoded.
  1821 + # get the filename:
  1822 + fname = bindata.get(ATTR_NAME, 'noname.mso')
  1823 + # decode the base64 activemime
  1824 + mso_data = binascii.a2b_base64(bindata.text)
  1825 + if is_mso_file(mso_data):
  1826 + # decompress the zlib data stored in the MSO file, which is the OLE container:
  1827 + # TODO: handle different offsets => separate function
  1828 + ole_data = mso_file_extract(mso_data)
  1829 + try:
  1830 + self.ole_subfiles.append(VBA_Parser(filename=fname, data=ole_data))
  1831 + except:
  1832 + logging.error('%s does not contain a valid OLE file' % fname)
  1833 + else:
  1834 + logging.error('%s is not a valid MSO file' % fname)
  1835 + # set type only if parsing succeeds
  1836 + self.type = TYPE_Word2003_XML
  1837 + except:
  1838 + # TODO: differentiate exceptions for each parsing stage
  1839 + logging.exception('Failed XML parsing for file %r' % self.filename)
  1840 + pass
  1841 +
  1842 + def open_mht(self, data):
  1843 + """
  1844 + Open an MHTML file
  1845 + :param data: file contents in a string or bytes
  1846 + :return: nothing
  1847 + """
  1848 + logging.info('Opening MHTML file %s' % self.filename)
  1849 + try:
  1850 + # parse the MIME content
  1851 + # remove any leading whitespace or newline (workaround for issue in email package)
  1852 + stripped_data = data.lstrip('\r\n\t ')
  1853 + mhtml = email.message_from_string(stripped_data)
  1854 + # find all the attached files:
  1855 + for part in mhtml.walk():
  1856 + content_type = part.get_content_type() # always returns a value
  1857 + fname = part.get_filename(None) # returns None if it fails
  1858 + # TODO: get content-location if no filename
  1859 + logging.debug('MHTML part: filename=%r, content-type=%r' % (fname, content_type))
  1860 + part_data = part.get_payload(decode=True)
  1861 + # VBA macros are stored in a binary file named "editdata.mso".
  1862 + # the data content is an OLE container for the VBA project, compressed
  1863 + # using the ActiveMime/MSO format (zlib-compressed), and Base64 encoded.
  1864 + # decompress the zlib data starting at offset 0x32, which is the OLE container:
  1865 + # check ActiveMime header:
  1866 + if isinstance(part_data, str) and is_mso_file(part_data):
  1867 + logging.debug('Found ActiveMime header, decompressing MSO container')
  1868 + try:
  1869 + ole_data = mso_file_extract(part_data)
  1870 + try:
  1871 + # TODO: check if it is actually an OLE file
  1872 + # TODO: get the MSO filename from content_location?
  1873 + self.ole_subfiles.append(VBA_Parser(filename=fname, data=ole_data))
  1874 + except:
  1875 + logging.debug('%s does not contain a valid OLE file' % fname)
  1876 + except:
  1877 + logging.exception('Failed decompressing an MSO container in %r - %s'
  1878 + % (fname, MSG_OLEVBA_ISSUES))
  1879 + # TODO: bug here - need to split in smaller functions/classes?
  1880 + # set type only if parsing succeeds
  1881 + self.type = TYPE_MHTML
  1882 + except:
  1883 + logging.exception('Failed MIME parsing for file %r - %s'
  1884 + % (self.filename, MSG_OLEVBA_ISSUES))
  1885 + pass
  1886 +
  1887 +
1842 1888 def find_vba_projects(self):
1843 1889 """
1844 1890 Finds all the VBA projects stored in an OLE file.
... ...