Commit 5fe1f07084f2f6e192899884c6345b2131efd2ff
1 parent
fe87211b
msodde: limit csv delimiters to reasonable set
The Python sniffer would pick "i" as the delimiter in plain text or "<" in XML. We prefer an error over a misinterpretation. Also, try all remaining delimiters, not just a second one. Rename one constant (added the CSV_ prefix).
Showing 1 changed file with 22 additions and 16 deletions
oletools/msodde.py
| ... | ... | @@ -845,7 +845,11 @@ def process_rtf(file_handle, field_filter_mode=None): |
| 845 | 845 | CSV_SMALL_THRESH = 1024 |
| 846 | 846 | |
| 847 | 847 | # format of dde link: program-name | arguments ! unimportant |
| 848 | -DDE_FORMAT = re.compile(r'\s*=(.+)\|(.+)!(.*)\s*') | |
| 848 | +CSV_DDE_FORMAT = re.compile(r'\s*=(.+)\|(.+)!(.*)\s*') | |
| 849 | + | |
| 850 | +# allowed delimiters (python sniffer would use nearly any char). Taken from | |
| 851 | +# https://data-gov.tw.rpi.edu/wiki/CSV_files_use_delimiters_other_than_commas | |
| 852 | +CSV_DELIMITERS = ',\t ;|^' | |
| 849 | 853 | |
| 850 | 854 | |
| 851 | 855 | def process_csv(filepath): |
| ... | ... | @@ -861,33 +865,35 @@ def process_csv(filepath): |
| 861 | 865 | """ |
| 862 | 866 | |
| 863 | 867 | results = [] |
| 864 | - with open(filepath, 'rb') as file_handle: | |
| 865 | - results, dialect = process_csv_dialect(file_handle) | |
| 868 | + with open(filepath, 'r') as file_handle: | |
| 869 | + results, dialect = process_csv_dialect(file_handle, CSV_DELIMITERS) | |
| 866 | 870 | is_small = file_handle.tell() < CSV_SMALL_THRESH |
| 867 | 871 | |
| 868 | 872 | if is_small and not results: |
| 869 | - # easy to mis-sniff small files. Try different delimiter | |
| 870 | - log.debug('small file, try second dialect') | |
| 873 | + # easy to mis-sniff small files. Try different delimiters | |
| 874 | + log.debug('small file, no results; try all delimiters') | |
| 871 | 875 | file_handle.seek(0) |
| 872 | - other_delim = ',\t ;|^'.replace(dialect.delimiter, '') | |
| 873 | - try: | |
| 874 | - results, _ = process_csv_dialect(file_handle, other_delim) | |
| 875 | - except csv.Error: # e.g. sniffing fails | |
| 876 | - log.debug('failed to csv-parse with different dialect', | |
| 877 | - exc_info=True) | |
| 876 | + other_delim = CSV_DELIMITERS.replace(dialect.delimiter, '') | |
| 877 | + for delim in other_delim: | |
| 878 | + try: | |
| 879 | + file_handle.seek(0) | |
| 880 | + results, _ = process_csv_dialect(file_handle, delim) | |
| 881 | + except csv.Error: # e.g. sniffing fails | |
| 882 | + log.debug('failed to csv-parse with delimiter {0!r}' | |
| 883 | + .format(delim)) | |
| 878 | 884 | |
| 879 | 885 | if is_small and not results: |
| 880 | - # try whole file as single cell | |
| 881 | - log.debug('try third time, taking whole file as single cell') | |
| 886 | + # try whole file as single cell, since sniffing fails in this case | |
| 887 | + log.debug('last attempt: take whole file as single unquoted cell') | |
| 882 | 888 | file_handle.seek(0) |
| 883 | - match = DDE_FORMAT.match(file_handle.read(CSV_SMALL_THRESH)) | |
| 889 | + match = CSV_DDE_FORMAT.match(file_handle.read(CSV_SMALL_THRESH)) | |
| 884 | 890 | if match: |
| 885 | 891 | results.append(u' '.join(match.groups()[:2])) |
| 886 | 892 | |
| 887 | 893 | return u'\n'.join(results) |
| 888 | 894 | |
| 889 | 895 | |
| 890 | -def process_csv_dialect(file_handle, delimiters=None): | |
| 896 | +def process_csv_dialect(file_handle, delimiters): | |
| 891 | 897 | """ helper for process_csv: process with a specific csv dialect """ |
| 892 | 898 | |
| 893 | 899 | # determine dialect = delimiter chars, quote chars, ... |
| ... | ... | @@ -907,7 +913,7 @@ def process_csv_dialect(file_handle, delimiters=None): |
| 907 | 913 | for row in reader: |
| 908 | 914 | for cell in row: |
| 909 | 915 | # check if cell matches |
| 910 | - match = DDE_FORMAT.match(cell) | |
| 916 | + match = CSV_DDE_FORMAT.match(cell) | |
| 911 | 917 | if match: |
| 912 | 918 | results.append(u' '.join(match.groups()[:2])) |
| 913 | 919 | return results, dialect | ... | ... |