Commit b20b2dae authored by delanoe

[FEAT] CSV Parser By Samuel.

parent 1d0cbe6f
from django.db import transaction
from lxml import etree
from .FileParser import FileParser
from ..NgramsExtractors import *
from datetime import datetime
from io import BytesIO
import csv
import sys
csv.field_size_limit(sys.maxsize)
import numpy as np
import os
class CSVParser(FileParser):
    """Parse a delimited text file into a list of per-row dictionaries.

    The delimiter is not assumed: the first lines of the file are sampled,
    each candidate delimiter is tried, and the one that splits every sampled
    row into the same number of columns (and into the most columns overall)
    is selected.  The first non-empty sampled row is taken as the header
    row; every following row becomes a ``{header: cell}`` dictionary.
    """

    # Candidate delimiters, in priority order: on a tie the earlier one wins.
    _CANDIDATE_DELIMITERS = (',', ' ', '\t', ';', '|', ':')

    # Number of leading lines inspected to detect the delimiter and header.
    _SAMPLE_SIZE = 10

    @staticmethod
    def _column_counts(lines, delim):
        """Return the number of columns of each CSV row found in *lines*."""
        return [len(row) for row in csv.reader(lines, delimiter=delim)]

    @staticmethod
    def _find_header(sample_lines, delim):
        """Locate the header in the sampled lines.

        Returns ``(row_index, column_index, header_names)`` for the first
        non-empty cell of the first non-empty row, or ``None`` when every
        sampled cell is empty.
        """
        for rownum, tokens in enumerate(csv.reader(sample_lines, delimiter=delim)):
            for columnum, token in enumerate(tokens):
                if token:
                    return rownum, columnum, tokens[columnum:]
        return None

    def CSVsample(self, filename, delim):
        """Return the column count of every row of *filename* split on *delim*.

        Args:
            filename: path of the delimited text file to read.
            delim: one-character delimiter handed to ``csv.reader``.

        Returns:
            A list with one integer per row (0 for blank lines).
        """
        # newline="" lets the csv module handle line endings itself, as its
        # documentation requires for file objects.
        with open(filename, "r", newline="") as ifile:
            return CSVParser._column_counts(ifile, delim)

    def _guess_delimiter(self, sample_lines):
        """Pick the delimiter that splits the sample most consistently.

        A delimiter qualifies when every sampled row has the same, non-zero
        number of columns; among the qualifying ones, the delimiter yielding
        the highest total number of columns is returned.

        Raises:
            ValueError: if no candidate delimiter qualifies.
        """
        best_delim = None
        best_total = 0
        for delim in self._CANDIDATE_DELIMITERS:
            counts = CSVParser._column_counts(sample_lines, delim)
            total = sum(counts)
            # One distinct value means a standard deviation of zero.
            # The strict ">" keeps the earliest candidate on a tie.
            if total > 0 and len(set(counts)) == 1 and total > best_total:
                best_delim, best_total = delim, total
        if best_delim is None:
            raise ValueError(
                "could not detect a CSV delimiter: no candidate splits the "
                "sampled rows into a constant number of columns"
            )
        return best_delim

    def _parse(self, filename):
        """Read *filename* and return one dictionary per data row.

        Args:
            filename: path of the delimited text file to parse.

        Returns:
            A list of dictionaries mapping header names to cell values.
            An empty list is returned when the file is empty or when its
            sampled lines hold no data.

        Raises:
            ValueError: if the delimiter cannot be detected from the sample.
        """
        # Sample the first lines in memory.  This replaces the former
        # `cat ... | head` shell pipeline, which was injection-prone, left a
        # *_sample.csv file behind and truncated the source file whenever
        # its name did not contain ".csv".
        with open(filename, "r", newline="") as ifile:
            sample_lines = [line for _, line in zip(range(self._SAMPLE_SIZE), ifile)]

        if not any(line.strip() for line in sample_lines):
            return []

        delimiter = self._guess_delimiter(sample_lines)

        header = CSVParser._find_header(sample_lines, delimiter)
        if header is None:
            return []
        header_row, first_column, header_names = header

        hyperdata_list = []
        with open(filename, "r", newline="") as ifile:
            reader = csv.reader(ifile, delimiter=delimiter)
            for rownum, tokens in enumerate(reader):
                # Skip everything up to and including the header row, and
                # blank lines, which would only yield empty records.
                if rownum <= header_row or not tokens:
                    continue
                # zip() stops at the shorter sequence, so a row with more
                # cells than the header no longer raises KeyError.
                hyperdata_list.append(
                    dict(zip(header_names, tokens[first_column:]))
                )
        return hyperdata_list
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment