Commit f61283c9 authored by sim

[REFACT] Add method for CSV delimiter detection

parent be116069
...@@ -7,29 +7,15 @@ import numpy as np ...@@ -7,29 +7,15 @@ import numpy as np
class CSVParser(Parser): class CSVParser(Parser):
DELIMITERS = ", \t;|:"
def parse(self, filebuf): def detect_delimiter(self, lines, sample_size=10):
sample = lines[:sample_size]
print("CSV: parsing (assuming UTF-8 and LF line endings)")
contents = filebuf.read().decode("UTF-8").split("\n")
# Filter out empty lines
contents = [line for line in contents if line.strip()]
sample_size = 10
sample_contents = contents[0:sample_size]
delimiters = ", \t;|:"
#==========================#
# DELIMITER AUTO-DETECTION #
#==========================#
# Compute frequency of each delimiter on each input line # Compute frequency of each delimiter on each input line
delimiters_freqs = { delimiters_freqs = {
d: [line.count(d) for line in sample_contents] d: [line.count(d) for line in sample]
for d in delimiters for d in self.DELIMITERS
} }
# Select delimiters with a standard deviation of zero, ie. delimiters # Select delimiters with a standard deviation of zero, ie. delimiters
...@@ -43,17 +29,26 @@ class CSVParser(Parser): ...@@ -43,17 +29,26 @@ class CSVParser(Parser):
if selected_delimiters: if selected_delimiters:
# Choose the delimiter with highest frequency amongst selected ones # Choose the delimiter with highest frequency amongst selected ones
sorted_delimiters = sorted(selected_delimiters, key=lambda x: x[1]) sorted_delimiters = sorted(selected_delimiters, key=lambda x: x[1])
best_delimiter = sorted_delimiters[-1][0] return sorted_delimiters[-1][0]
else:
def parse(self, filebuf):
print("CSV: parsing (assuming UTF-8 and LF line endings)")
contents = filebuf.read().decode("UTF-8").split("\n")
# Filter out empty lines
contents = [line for line in contents if line.strip()]
# Delimiter auto-detection
delimiter = self.detect_delimiter(contents, sample_size=10)
if delimiter is None:
raise ValueError("CSV: couldn't detect delimiter, bug or malformed data") raise ValueError("CSV: couldn't detect delimiter, bug or malformed data")
print("CSV: selected delimiter: %r" % delimiter) print("CSV: selected delimiter: %r" % delimiter)
#=================# # Parse CSV
# DATA PROCESSING # reader = csv.reader(contents, delimiter=delimiter)
#=================#
reader = csv.reader(contents, delimiter=best_delimiter)
# Get first not empty row and its fields (ie. header row), or (0, []) # Get first not empty row and its fields (ie. header row), or (0, [])
first_row, headers = \ first_row, headers = \
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment