Commit 5fe1f07084f2f6e192899884c6345b2131efd2ff

Authored by Christian Herdtweck
1 parent fe87211b

msodde: limit csv delimiters to reasonable set

Python's csv sniffer would pick "i" as the delimiter in plain text or "<" in
XML. We prefer an error over misinterpretation.

Also, for small files without results, retry with each remaining delimiter individually, not just one second attempt.

Rename one constant: DDE_FORMAT is now CSV_DDE_FORMAT (added CSV_ prefix).
Showing 1 changed file with 22 additions and 16 deletions
oletools/msodde.py
@@ -845,7 +845,11 @@ def process_rtf(file_handle, field_filter_mode=None): @@ -845,7 +845,11 @@ def process_rtf(file_handle, field_filter_mode=None):
845 CSV_SMALL_THRESH = 1024 845 CSV_SMALL_THRESH = 1024
846 846
847 # format of dde link: program-name | arguments ! unimportant 847 # format of dde link: program-name | arguments ! unimportant
848 -DDE_FORMAT = re.compile(r'\s*=(.+)\|(.+)!(.*)\s*') 848 +CSV_DDE_FORMAT = re.compile(r'\s*=(.+)\|(.+)!(.*)\s*')
  849 +
  850 +# allowed delimiters (python sniffer would use nearly any char). Taken from
  851 +# https://data-gov.tw.rpi.edu/wiki/CSV_files_use_delimiters_other_than_commas
  852 +CSV_DELIMITERS = ',\t ;|^'
849 853
850 854
851 def process_csv(filepath): 855 def process_csv(filepath):
@@ -861,33 +865,35 @@ def process_csv(filepath): @@ -861,33 +865,35 @@ def process_csv(filepath):
861 """ 865 """
862 866
863 results = [] 867 results = []
864 - with open(filepath, 'rb') as file_handle:  
865 - results, dialect = process_csv_dialect(file_handle) 868 + with open(filepath, 'r') as file_handle:
  869 + results, dialect = process_csv_dialect(file_handle, CSV_DELIMITERS)
866 is_small = file_handle.tell() < CSV_SMALL_THRESH 870 is_small = file_handle.tell() < CSV_SMALL_THRESH
867 871
868 if is_small and not results: 872 if is_small and not results:
869 - # easy to mis-sniff small files. Try different delimiter  
870 - log.debug('small file, try second dialect') 873 + # easy to mis-sniff small files. Try different delimiters
  874 + log.debug('small file, no results; try all delimiters')
871 file_handle.seek(0) 875 file_handle.seek(0)
872 - other_delim = ',\t ;|^'.replace(dialect.delimiter, '')  
873 - try:  
874 - results, _ = process_csv_dialect(file_handle, other_delim)  
875 - except csv.Error: # e.g. sniffing fails  
876 - log.debug('failed to csv-parse with different dialect',  
877 - exc_info=True) 876 + other_delim = CSV_DELIMITERS.replace(dialect.delimiter, '')
  877 + for delim in other_delim:
  878 + try:
  879 + file_handle.seek(0)
  880 + results, _ = process_csv_dialect(file_handle, delim)
  881 + except csv.Error: # e.g. sniffing fails
  882 + log.debug('failed to csv-parse with delimiter {0!r}'
  883 + .format(delim))
878 884
879 if is_small and not results: 885 if is_small and not results:
880 - # try whole file as single cell  
881 - log.debug('try third time, taking whole file as single cell') 886 + # try whole file as single cell, since sniffing fails in this case
  887 + log.debug('last attempt: take whole file as single unquoted cell')
882 file_handle.seek(0) 888 file_handle.seek(0)
883 - match = DDE_FORMAT.match(file_handle.read(CSV_SMALL_THRESH)) 889 + match = CSV_DDE_FORMAT.match(file_handle.read(CSV_SMALL_THRESH))
884 if match: 890 if match:
885 results.append(u' '.join(match.groups()[:2])) 891 results.append(u' '.join(match.groups()[:2]))
886 892
887 return u'\n'.join(results) 893 return u'\n'.join(results)
888 894
889 895
890 -def process_csv_dialect(file_handle, delimiters=None): 896 +def process_csv_dialect(file_handle, delimiters):
891 """ helper for process_csv: process with a specific csv dialect """ 897 """ helper for process_csv: process with a specific csv dialect """
892 898
893 # determine dialect = delimiter chars, quote chars, ... 899 # determine dialect = delimiter chars, quote chars, ...
@@ -907,7 +913,7 @@ def process_csv_dialect(file_handle, delimiters=None): @@ -907,7 +913,7 @@ def process_csv_dialect(file_handle, delimiters=None):
907 for row in reader: 913 for row in reader:
908 for cell in row: 914 for cell in row:
909 # check if cell matches 915 # check if cell matches
910 - match = DDE_FORMAT.match(cell) 916 + match = CSV_DDE_FORMAT.match(cell)
911 if match: 917 if match:
912 results.append(u' '.join(match.groups()[:2])) 918 results.append(u' '.join(match.groups()[:2]))
913 return results, dialect 919 return results, dialect