Commit 5fe1f07084f2f6e192899884c6345b2131efd2ff

Authored by Christian Herdtweck
1 parent fe87211b

msodde: limit csv delimiters to reasonable set

Python's csv sniffer would pick "i" as the delimiter in plain text or "<" in
XML. We prefer an error over a misinterpretation.

Also, try each of the remaining allowed delimiters individually, not just one second attempt.

Rename one constant: DDE_FORMAT -> CSV_DDE_FORMAT (added CSV_ prefix)
Showing 1 changed file with 22 additions and 16 deletions
oletools/msodde.py
... ... @@ -845,7 +845,11 @@ def process_rtf(file_handle, field_filter_mode=None):
845 845 CSV_SMALL_THRESH = 1024
846 846  
847 847 # format of dde link: program-name | arguments ! unimportant
848   -DDE_FORMAT = re.compile(r'\s*=(.+)\|(.+)!(.*)\s*')
  848 +CSV_DDE_FORMAT = re.compile(r'\s*=(.+)\|(.+)!(.*)\s*')
  849 +
  850 +# allowed delimiters (python sniffer would use nearly any char). Taken from
  851 +# https://data-gov.tw.rpi.edu/wiki/CSV_files_use_delimiters_other_than_commas
  852 +CSV_DELIMITERS = ',\t ;|^'
849 853  
850 854  
851 855 def process_csv(filepath):
... ... @@ -861,33 +865,35 @@ def process_csv(filepath):
861 865 """
862 866  
863 867 results = []
864   - with open(filepath, 'rb') as file_handle:
865   - results, dialect = process_csv_dialect(file_handle)
  868 + with open(filepath, 'r') as file_handle:
  869 + results, dialect = process_csv_dialect(file_handle, CSV_DELIMITERS)
866 870 is_small = file_handle.tell() < CSV_SMALL_THRESH
867 871  
868 872 if is_small and not results:
869   - # easy to mis-sniff small files. Try different delimiter
870   - log.debug('small file, try second dialect')
  873 + # easy to mis-sniff small files. Try different delimiters
  874 + log.debug('small file, no results; try all delimiters')
871 875 file_handle.seek(0)
872   - other_delim = ',\t ;|^'.replace(dialect.delimiter, '')
873   - try:
874   - results, _ = process_csv_dialect(file_handle, other_delim)
875   - except csv.Error: # e.g. sniffing fails
876   - log.debug('failed to csv-parse with different dialect',
877   - exc_info=True)
  876 + other_delim = CSV_DELIMITERS.replace(dialect.delimiter, '')
  877 + for delim in other_delim:
  878 + try:
  879 + file_handle.seek(0)
  880 + results, _ = process_csv_dialect(file_handle, delim)
  881 + except csv.Error: # e.g. sniffing fails
  882 + log.debug('failed to csv-parse with delimiter {0!r}'
  883 + .format(delim))
878 884  
879 885 if is_small and not results:
880   - # try whole file as single cell
881   - log.debug('try third time, taking whole file as single cell')
  886 + # try whole file as single cell, since sniffing fails in this case
  887 + log.debug('last attempt: take whole file as single unquoted cell')
882 888 file_handle.seek(0)
883   - match = DDE_FORMAT.match(file_handle.read(CSV_SMALL_THRESH))
  889 + match = CSV_DDE_FORMAT.match(file_handle.read(CSV_SMALL_THRESH))
884 890 if match:
885 891 results.append(u' '.join(match.groups()[:2]))
886 892  
887 893 return u'\n'.join(results)
888 894  
889 895  
890   -def process_csv_dialect(file_handle, delimiters=None):
  896 +def process_csv_dialect(file_handle, delimiters):
891 897 """ helper for process_csv: process with a specific csv dialect """
892 898  
893 899 # determine dialect = delimiter chars, quote chars, ...
... ... @@ -907,7 +913,7 @@ def process_csv_dialect(file_handle, delimiters=None):
907 913 for row in reader:
908 914 for cell in row:
909 915 # check if cell matches
910   - match = DDE_FORMAT.match(cell)
  916 + match = CSV_DDE_FORMAT.match(cell)
911 917 if match:
912 918 results.append(u' '.join(match.groups()[:2]))
913 919 return results, dialect
... ...