Commit 5fe1f07084f2f6e192899884c6345b2131efd2ff
1 parent
fe87211b
msodde: limit csv delimiters to reasonable set
The Python sniffer would find "i" as the delimiter in plain text or "<" in XML. We prefer an error over misinterpretation. Also, try all delimiters, not just a second one. Rename one constant (added the CSV_ prefix).
Showing
1 changed file
with
22 additions
and
16 deletions
oletools/msodde.py
| @@ -845,7 +845,11 @@ def process_rtf(file_handle, field_filter_mode=None): | @@ -845,7 +845,11 @@ def process_rtf(file_handle, field_filter_mode=None): | ||
| 845 | CSV_SMALL_THRESH = 1024 | 845 | CSV_SMALL_THRESH = 1024 |
| 846 | 846 | ||
| 847 | # format of dde link: program-name | arguments ! unimportant | 847 | # format of dde link: program-name | arguments ! unimportant |
| 848 | -DDE_FORMAT = re.compile(r'\s*=(.+)\|(.+)!(.*)\s*') | 848 | +CSV_DDE_FORMAT = re.compile(r'\s*=(.+)\|(.+)!(.*)\s*') |
| 849 | + | ||
| 850 | +# allowed delimiters (python sniffer would use nearly any char). Taken from | ||
| 851 | +# https://data-gov.tw.rpi.edu/wiki/CSV_files_use_delimiters_other_than_commas | ||
| 852 | +CSV_DELIMITERS = ',\t ;|^' | ||
| 849 | 853 | ||
| 850 | 854 | ||
| 851 | def process_csv(filepath): | 855 | def process_csv(filepath): |
| @@ -861,33 +865,35 @@ def process_csv(filepath): | @@ -861,33 +865,35 @@ def process_csv(filepath): | ||
| 861 | """ | 865 | """ |
| 862 | 866 | ||
| 863 | results = [] | 867 | results = [] |
| 864 | - with open(filepath, 'rb') as file_handle: | ||
| 865 | - results, dialect = process_csv_dialect(file_handle) | 868 | + with open(filepath, 'r') as file_handle: |
| 869 | + results, dialect = process_csv_dialect(file_handle, CSV_DELIMITERS) | ||
| 866 | is_small = file_handle.tell() < CSV_SMALL_THRESH | 870 | is_small = file_handle.tell() < CSV_SMALL_THRESH |
| 867 | 871 | ||
| 868 | if is_small and not results: | 872 | if is_small and not results: |
| 869 | - # easy to mis-sniff small files. Try different delimiter | ||
| 870 | - log.debug('small file, try second dialect') | 873 | + # easy to mis-sniff small files. Try different delimiters |
| 874 | + log.debug('small file, no results; try all delimiters') | ||
| 871 | file_handle.seek(0) | 875 | file_handle.seek(0) |
| 872 | - other_delim = ',\t ;|^'.replace(dialect.delimiter, '') | ||
| 873 | - try: | ||
| 874 | - results, _ = process_csv_dialect(file_handle, other_delim) | ||
| 875 | - except csv.Error: # e.g. sniffing fails | ||
| 876 | - log.debug('failed to csv-parse with different dialect', | ||
| 877 | - exc_info=True) | 876 | + other_delim = CSV_DELIMITERS.replace(dialect.delimiter, '') |
| 877 | + for delim in other_delim: | ||
| 878 | + try: | ||
| 879 | + file_handle.seek(0) | ||
| 880 | + results, _ = process_csv_dialect(file_handle, delim) | ||
| 881 | + except csv.Error: # e.g. sniffing fails | ||
| 882 | + log.debug('failed to csv-parse with delimiter {0!r}' | ||
| 883 | + .format(delim)) | ||
| 878 | 884 | ||
| 879 | if is_small and not results: | 885 | if is_small and not results: |
| 880 | - # try whole file as single cell | ||
| 881 | - log.debug('try third time, taking whole file as single cell') | 886 | + # try whole file as single cell, since sniffing fails in this case |
| 887 | + log.debug('last attempt: take whole file as single unquoted cell') | ||
| 882 | file_handle.seek(0) | 888 | file_handle.seek(0) |
| 883 | - match = DDE_FORMAT.match(file_handle.read(CSV_SMALL_THRESH)) | 889 | + match = CSV_DDE_FORMAT.match(file_handle.read(CSV_SMALL_THRESH)) |
| 884 | if match: | 890 | if match: |
| 885 | results.append(u' '.join(match.groups()[:2])) | 891 | results.append(u' '.join(match.groups()[:2])) |
| 886 | 892 | ||
| 887 | return u'\n'.join(results) | 893 | return u'\n'.join(results) |
| 888 | 894 | ||
| 889 | 895 | ||
| 890 | -def process_csv_dialect(file_handle, delimiters=None): | 896 | +def process_csv_dialect(file_handle, delimiters): |
| 891 | """ helper for process_csv: process with a specific csv dialect """ | 897 | """ helper for process_csv: process with a specific csv dialect """ |
| 892 | 898 | ||
| 893 | # determine dialect = delimiter chars, quote chars, ... | 899 | # determine dialect = delimiter chars, quote chars, ... |
| @@ -907,7 +913,7 @@ def process_csv_dialect(file_handle, delimiters=None): | @@ -907,7 +913,7 @@ def process_csv_dialect(file_handle, delimiters=None): | ||
| 907 | for row in reader: | 913 | for row in reader: |
| 908 | for cell in row: | 914 | for cell in row: |
| 909 | # check if cell matches | 915 | # check if cell matches |
| 910 | - match = DDE_FORMAT.match(cell) | 916 | + match = CSV_DDE_FORMAT.match(cell) |
| 911 | if match: | 917 | if match: |
| 912 | results.append(u' '.join(match.groups()[:2])) | 918 | results.append(u' '.join(match.groups()[:2])) |
| 913 | return results, dialect | 919 | return results, dialect |