[REFACT] Add method for CSV delimiter detection

f61283c9 · sim · be116069 · f61283c9
Commit f61283c9 authored Aug 17, 2017 by sim
Hide whitespace changes
Inline Side-by-side

Showing with 21 additions and 26 deletions

CSV.py gargantext/util/parsers/CSV.py +21 -26

No files found.
--- a/gargantext/util/parsers/CSV.py
+++ b/gargantext/util/parsers/CSV.py
@@ -7,29 +7,15 @@ import numpy as np
 class CSVParser(Parser):
+    DELIMITERS = ", \t;|:"
-    def parse(self, filebuf):
+    def detect_delimiter(self, lines, sample_size=10):
+        sample = lines[:sample_size]
-        print("CSV: parsing (assuming UTF-8 and LF line endings)")
-        contents = filebuf.read().decode("UTF-8").split("\n")
-        # Filter out empty lines
-        contents = [line for line in contents if line.strip()]
-        sample_size = 10
-        sample_contents = contents[0:sample_size]
-        delimiters = ", \t;|:"
-        #==========================#
-        # DELIMITER AUTO-DETECTION #
-        #==========================#
        # Compute frequency of each delimiter on each input line
        delimiters_freqs = {
-            d: [line.count(d) for line in sample_contents]
+            d: [line.count(d) for line in sample]
-            for d in delimiters
+            for d in self.DELIMITERS
        }
        # Select delimiters with a standard deviation of zero, ie. delimiters
@@ -43,17 +29,26 @@ class CSVParser(Parser):
        if selected_delimiters:
            # Choose the delimiter with highest frequency amongst selected ones
            sorted_delimiters = sorted(selected_delimiters, key=lambda x: x[1])
-            best_delimiter = sorted_delimiters[-1][0]
+            return sorted_delimiters[-1][0]
-        else:
+    def parse(self, filebuf):
+        print("CSV: parsing (assuming UTF-8 and LF line endings)")
+        contents = filebuf.read().decode("UTF-8").split("\n")
+        # Filter out empty lines
+        contents = [line for line in contents if line.strip()]
+        # Delimiter auto-detection
+        delimiter = self.detect_delimiter(contents, sample_size=10)
+        if delimiter is None:
            raise ValueError("CSV: couldn't detect delimiter, bug or malformed data")
        print("CSV: selected delimiter: %r" % delimiter)
-        #=================#
+        # Parse CSV
-        # DATA PROCESSING #
+        reader = csv.reader(contents, delimiter=delimiter)
-        #=================#
-        reader = csv.reader(contents, delimiter=best_delimiter)
        # Get first not empty row and its fields (ie. header row), or (0, [])
        first_row, headers = \