Commit 7367f14d authored by sim

[FIX] Use pandas for much better CSV handling

parent 7c7c1054
from ._Parser import Parser
# from ..NgramsExtractors import *
import sys
import csv
csv.field_size_limit(sys.maxsize)
import numpy as np
import pandas
import io
# NOTE(review): this listing appears to interleave the removed and added lines
# of a commit diff, with the +/- markers and the indentation lost (two `parse`
# definitions, two consecutive `for` headers, `open` starting in the middle of
# `detect_delimiter`). Read it as a reference, not as runnable code -- TODO
# confirm against the repository before reusing any of it.
class CSVParser(Parser):
# Candidate delimiter characters examined by detect_delimiter().
DELIMITERS = ", \t;|:"
# Encoding handed to io.TextIOWrapper when open() builds the text stream.
ENCODING = "utf-8"
# detect_delimiter: guess the field delimiter from the first `sample_size`
# entries of `lines`. Its statements continue below, mixed with those of `open`.
def detect_delimiter(self, lines, sample_size=10):
sample = lines[:sample_size]
# open: get the file object from the parent class; return it untouched when
# `file` is a str ending in '.zip', otherwise wrap it in a text stream decoded
# with ENCODING.
def open(self, file):
f = super(CSVParser, self).open(file)
# Compute frequency of each delimiter on each input line
delimiters_freqs = {
d: [line.count(d) for line in sample]
for d in self.DELIMITERS
}
if isinstance(file, str) and file.endswith('.zip'):
return f
# Select delimiters with a standard deviation of zero, ie. delimiters
# for which we have the same number of fields on each line
# (`any(freqs)` additionally discards delimiters that never occur at all)
selected_delimiters = [
(d, np.sum(freqs))
for d, freqs in delimiters_freqs.items()
if any(freqs) and np.std(freqs) == 0
]
return io.TextIOWrapper(f, encoding=self.ENCODING)
# (detect_delimiter) Nothing is returned explicitly when no delimiter
# qualifies; the first `parse` below tests the result against None.
if selected_delimiters:
# Choose the delimiter with highest frequency amongst selected ones
sorted_delimiters = sorted(selected_delimiters, key=lambda x: x[1])
return sorted_delimiters[-1][0]
# parse (presumably the pre-commit version): read the whole buffer, decode it
# as UTF-8, split on "\n", drop blank lines, auto-detect the delimiter and
# read the rows with csv.reader.
def parse(self, filebuf):
print("CSV: parsing (assuming UTF-8 and LF line endings)")
contents = filebuf.read().decode("UTF-8").split("\n")
# Filter out empty lines
contents = [line for line in contents if line.strip()]
# Delimiter auto-detection
delimiter = self.detect_delimiter(contents, sample_size=10)
if delimiter is None:
raise ValueError("CSV: couldn't detect delimiter, bug or malformed data")
print("CSV: selected delimiter: %r" % delimiter)
# Parse CSV
reader = csv.reader(contents, delimiter=delimiter)
# Get first not empty row and its fields (ie. header row), or (0, [])
first_row, headers = \
next(((i, fields) for i, fields in enumerate(reader) if any(fields)),
(0, []))
# Get first not empty column of the first row, or 0
first_col = next((i for i, field in enumerate(headers) if field), 0)
# Strip out potential empty fields in headers
headers = headers[first_col:]
# parse (presumably the post-commit version): load `fp` -- or self._file when
# `fp` is falsy -- with pandas.read_csv, reading every column as object dtype
# and skipping blank lines, then yield one mapping per row.
def parse(self, fp=None):
fp = fp or self._file
df = pandas.read_csv(fp, dtype=object, skip_blank_lines=True)
# Return a generator of dictionaries with column labels as keys,
# filtering out empty rows
# NOTE(review): the next two `for` headers look like the old (csv.reader) and
# the new (DataFrame) loop shown side by side -- TODO confirm.
for i, fields in enumerate(reader):
for i, fields in enumerate(df.itertuples(index=False)):
# Progress message on every 500th row (row numbers are reported 1-based).
if i % 500 == 0:
print("CSV: parsing row #%s..." % (i+1))
# Presumably the old loop body: skip rows whose fields are all empty and key
# the remaining fields by header, starting at the first non-empty column.
if any(fields):
yield dict(zip(headers, fields[first_col:]))
# See https://docs.python.org/3/library/collections.html#collections.somenamedtuple._asdict
yield fields._asdict()
......@@ -14,15 +14,14 @@ class Parser:
"""
def __init__(self, file):
    """Attach the input to parse to this parser.

    `file` is handed to `self.open()`, which decides how to turn it into a
    file object; the result is stored in `self._file`.

    All opening goes through `self.open()` so that a subclass override is
    honoured and a path is opened exactly once. Opening the path here as
    well would leave a second, never-closed handle behind.
    """
    self._file = self.open(file)
def __del__(self):
    """Close the attached file object when the parser is destroyed.

    Does nothing when no `_file` attribute was ever set on the instance.
    """
    try:
        handle = self._file
    except AttributeError:
        return
    handle.close()
def open(self, file):
    """Return the object to read from for `file`.

    A str is treated as a path and opened in binary read mode; any other
    value is returned unchanged.
    """
    if not isinstance(file, str):
        return file
    # Inside a method body the bare name `open` is the builtin, not this method.
    return open(file, 'rb')
def detect_encoding(self, string):
"""Useful method to detect the encoding of a document.
......@@ -165,9 +164,9 @@ class Parser:
file = self._file
# if the file is a ZIP archive, recurse on each of its files...
if zipfile.is_zipfile(file):
with zipfile.ZipFile(file) as zipArchive:
for filename in zipArchive.namelist():
with zipArchive.open(filename) as f:
with zipfile.ZipFile(file) as zf:
for filename in zf.namelist():
with zf.open(filename) as df, self.open(df) as f:
yield from self.__iter__(f)
# ...otherwise, let's parse it directly!
else:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment