Commit 7367f14d authored by sim

[FIX] Use pandas for much better CSV handling

parent 7c7c1054
from ._Parser import Parser from ._Parser import Parser
# from ..NgramsExtractors import * import pandas
import sys import io
import csv
csv.field_size_limit(sys.maxsize)
import numpy as np
class CSVParser(Parser): class CSVParser(Parser):
DELIMITERS = ", \t;|:" ENCODING = "utf-8"
def detect_delimiter(self, lines, sample_size=10): def open(self, file):
sample = lines[:sample_size] f = super(CSVParser, self).open(file)
# Compute frequency of each delimiter on each input line if isinstance(file, str) and file.endswith('.zip'):
delimiters_freqs = { return f
d: [line.count(d) for line in sample]
for d in self.DELIMITERS
}
# Select delimiters with a standard deviation of zero, ie. delimiters return io.TextIOWrapper(f, encoding=self.ENCODING)
# for which we have the same number of fields on each line
selected_delimiters = [
(d, np.sum(freqs))
for d, freqs in delimiters_freqs.items()
if any(freqs) and np.std(freqs) == 0
]
if selected_delimiters: def parse(self, fp=None):
# Choose the delimiter with highest frequency amongst selected ones fp = fp or self._file
sorted_delimiters = sorted(selected_delimiters, key=lambda x: x[1]) df = pandas.read_csv(fp, dtype=object, skip_blank_lines=True)
return sorted_delimiters[-1][0]
def parse(self, filebuf):
print("CSV: parsing (assuming UTF-8 and LF line endings)")
contents = filebuf.read().decode("UTF-8").split("\n")
# Filter out empty lines
contents = [line for line in contents if line.strip()]
# Delimiter auto-detection
delimiter = self.detect_delimiter(contents, sample_size=10)
if delimiter is None:
raise ValueError("CSV: couldn't detect delimiter, bug or malformed data")
print("CSV: selected delimiter: %r" % delimiter)
# Parse CSV
reader = csv.reader(contents, delimiter=delimiter)
# Get first not empty row and its fields (ie. header row), or (0, [])
first_row, headers = \
next(((i, fields) for i, fields in enumerate(reader) if any(fields)),
(0, []))
# Get first not empty column of the first row, or 0
first_col = next((i for i, field in enumerate(headers) if field), 0)
# Strip out potential empty fields in headers
headers = headers[first_col:]
# Return a generator of dictionaries with column labels as keys, # Return a generator of dictionaries with column labels as keys,
# filtering out empty rows # filtering out empty rows
for i, fields in enumerate(reader): for i, fields in enumerate(df.itertuples(index=False)):
if i % 500 == 0: if i % 500 == 0:
print("CSV: parsing row #%s..." % (i+1)) print("CSV: parsing row #%s..." % (i+1))
if any(fields):
yield dict(zip(headers, fields[first_col:])) # See https://docs.python.org/3/library/collections.html#collections.somenamedtuple._asdict
yield fields._asdict()
...@@ -14,15 +14,14 @@ class Parser: ...@@ -14,15 +14,14 @@ class Parser:
""" """
def __init__(self, file): def __init__(self, file):
if isinstance(file, str): self._file = self.open(file)
self._file = open(file, 'rb')
else:
self._file = file
def __del__(self): def __del__(self):
if hasattr(self, '_file'): if hasattr(self, '_file'):
self._file.close() self._file.close()
def open(self, file):
# Return a readable file object for `file`: when `file` is a str it is
# treated as a path and opened in binary read mode ('rb'); any other value
# is returned unchanged (presumably an already-open file-like object --
# confirm against callers).
return open(file, 'rb') if isinstance(file, str) else file
def detect_encoding(self, string): def detect_encoding(self, string):
"""Useful method to detect the encoding of a document. """Useful method to detect the encoding of a document.
...@@ -165,9 +164,9 @@ class Parser: ...@@ -165,9 +164,9 @@ class Parser:
file = self._file file = self._file
# if the file is a ZIP archive, recurse on each of its files... # if the file is a ZIP archive, recurse on each of its files...
if zipfile.is_zipfile(file): if zipfile.is_zipfile(file):
with zipfile.ZipFile(file) as zipArchive: with zipfile.ZipFile(file) as zf:
for filename in zipArchive.namelist(): for filename in zf.namelist():
with zipArchive.open(filename) as f: with zf.open(filename) as df, self.open(df) as f:
yield from self.__iter__(f) yield from self.__iter__(f)
# ...otherwise, let's parse it directly! # ...otherwise, let's parse it directly!
else: else:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment