[FIX] Use pandas for much better CSV handling

7367f14d · sim · 7c7c1054 · 7367f14d · 7367f14d
Commit 7367f14d authored Oct 17, 2017 by sim
Hide whitespace changes
Inline Side-by-side

Showing with 21 additions and 64 deletions

CSV.py gargantext/util/parsers/CSV.py +15 -57

_Parser.py gargantext/util/parsers/_Parser.py +6 -7

No files found.
--- a/gargantext/util/parsers/CSV.py
+++ b/gargantext/util/parsers/CSV.py
 from ._Parser import Parser
-# from ..NgramsExtractors import *
-import sys
-import csv
-csv.field_size_limit(sys.maxsize)
-import numpy as np
+import pandas
+import io


 class CSVParser(Parser):
-    DELIMITERS = ", \t;|:"
+    ENCODING = "utf-8"

-    def detect_delimiter(self, lines, sample_size=10):
-        sample = lines[:sample_size]
+    def open(self, file):
+        f = super(CSVParser, self).open(file)

-        # Compute frequency of each delimiter on each input line
-        delimiters_freqs = {
-            d: [line.count(d) for line in sample]
-            for d in self.DELIMITERS
-        }
+        if isinstance(file, str) and file.endswith('.zip'):
+            return f

-        # Select delimiters with a standard deviation of zero, ie. delimiters
-        # for which we have the same number of fields on each line
-        selected_delimiters = [
-            (d, np.sum(freqs))
-            for d, freqs in delimiters_freqs.items()
-            if any(freqs) and np.std(freqs) == 0
-        ]
+        return io.TextIOWrapper(f, encoding=self.ENCODING)

-        if selected_delimiters:
-            # Choose the delimiter with highest frequency amongst selected ones
-            sorted_delimiters = sorted(selected_delimiters, key=lambda x: x[1])
-            return sorted_delimiters[-1][0]
-
-    def parse(self, filebuf):
-        print("CSV: parsing (assuming UTF-8 and LF line endings)")
-
-        contents = filebuf.read().decode("UTF-8").split("\n")
-
-        # Filter out empty lines
-        contents = [line for line in contents if line.strip()]
-
-        # Delimiter auto-detection
-        delimiter = self.detect_delimiter(contents, sample_size=10)
-
-        if delimiter is None:
-            raise ValueError("CSV: couldn't detect delimiter, bug or malformed data")
-
-        print("CSV: selected delimiter: %r" % delimiter)
-
-        # Parse CSV
-        reader = csv.reader(contents, delimiter=delimiter)
-
-        # Get first not empty row and its fields (ie. header row), or (0, [])
-        first_row, headers = \
-            next(((i, fields) for i, fields in enumerate(reader) if any(fields)),
-                 (0, []))
-
-        # Get first not empty column of the first row, or 0
-        first_col = next((i for i, field in enumerate(headers) if field), 0)
-
-        # Strip out potential empty fields in headers
-        headers = headers[first_col:]
+    def parse(self, fp=None):
+        fp = fp or self._file
+        df = pandas.read_csv(fp, dtype=object, skip_blank_lines=True)

        # Return a generator of dictionaries with column labels as keys,
        # filtering out empty rows
-        for i, fields in enumerate(reader):
+        for i, fields in enumerate(df.itertuples(index=False)):
            if i % 500 == 0:
                print("CSV: parsing row #%s..." % (i+1))
-            if any(fields):
-                yield dict(zip(headers, fields[first_col:]))
+
+            # See https://docs.python.org/3/library/collections.html#collections.somenamedtuple._asdict
+            yield fields._asdict()
--- a/gargantext/util/parsers/_Parser.py
+++ b/gargantext/util/parsers/_Parser.py
@@ -14,15 +14,14 @@ class Parser:
    """

    def __init__(self, file):
-        if isinstance(file, str):
-            self._file = open(file, 'rb')
-        else:
-            self._file = file
+        self._file = self.open(file)

    def __del__(self):
        if hasattr(self, '_file'):
            self._file.close()

+    def open(self, file):
+        return open(file, 'rb') if isinstance(file, str) else file

    def detect_encoding(self, string):
        """Useful method to detect the encoding of a document.
@@ -165,9 +164,9 @@ class Parser:
            file = self._file
        # if the file is a ZIP archive, recurse on each of its files...
        if zipfile.is_zipfile(file):
-            with zipfile.ZipFile(file) as zipArchive:
-                for filename in zipArchive.namelist():
-                    with zipArchive.open(filename) as f:
+            with zipfile.ZipFile(file) as zf:
+                for filename in zf.namelist():
+                    with zf.open(filename) as df, self.open(df) as f:
                        yield from self.__iter__(f)
        # ...otherwise, let's parse it directly!
        else: