Commit 53f4d9b9 authored by delanoe's avatar delanoe

[FIX] merge url conflicts.

parents 95bbf414 ba2517a4
// dot ngram_parsing_flow.dot -Tpng -o ngram_parsing_flow.png
digraph ngramflow {
edge [fontsize=10] ;
label=<<B><U>gargantext.util.toolchain</U></B><BR/>(ngram extraction flow)>;
labelloc="t" ;
"extracted_ngrams" -> "grouplist" ;
"extracted_ngrams" -> "occs+tfidfs" ;
"main_user_stoplist" -> "stoplist" ;
"stoplist" -> "mainlist" ;
"occs+tfidfs" -> "mainlist" [label=" TFIDF_LIMIT"];
"mainlist" -> "coocs" [label=" COOCS_THRESHOLD"] ;
"coocs" -> "specificity" ;
"specificity" -> "maplist" [label="MAPLIST_LIMIT\nMONOGRAM_PART"];
"maplist" -> "explore" ;
"grouplist" -> "maplist" ;
}
......@@ -156,10 +156,10 @@ RESOURCETYPES = [
'parser': RISParser,
'default_language': 'en',
},
# { 'name': 'CSV',
# # 'parser': CSVParser,
# 'default_language': 'en',
# },
{ 'name': 'CSV',
'parser': CSVParser,
'default_language': 'en',
},
# { 'name': 'ISTex',
# # 'parser': ISTexParser,
# 'default_language': 'en',
......
......@@ -35,8 +35,7 @@ Double = DOUBLE_PRECISION
# useful for queries
from sqlalchemy.orm import aliased
from sqlalchemy import func
from sqlalchemy import func, desc
# bulk insertions
......
......@@ -8,34 +8,32 @@ import os
class CSVParser(Parser):
def CSVsample( self, filename , delim) :
ifile = open( filename, "r" )
reader = csv.reader(ifile, delimiter=delim)
def CSVsample( self, small_contents , delim) :
reader = csv.reader(small_contents, delimiter=delim)
Freqs = []
for row in reader:
Freqs.append(len(row))
ifile.close()
return Freqs
def parse(self, filename):
print("CSV: parsing (assuming UTF-8 and LF line endings)")
contents = filename.read().decode("UTF-8").split("\n")
sample_size = 10
sample_file = filename.replace(".csv","_sample.csv")
sample_contents = contents[0:sample_size]
hyperdata_list = []
command_for_sample = "cat '"+filename+"' | head -n "+str(sample_size)+" > '"+sample_file+"'"
os.system(command_for_sample) # you just created a *_sample.csv
# # = = = = [ Getting delimiters frequency ] = = = = #
PossibleDelimiters = [ ',',' ','\t', ';', '|', ':' ]
AllDelimiters = {}
for delim in PossibleDelimiters:
AllDelimiters[delim] = self.CSVsample( sample_file , delim )
AllDelimiters[delim] = self.CSVsample( sample_contents , delim )
# # = = = = [ / Getting delimiters frequency ] = = = = #
# # OUTPUT example:
# # AllDelimiters = {
......@@ -59,8 +57,8 @@ class CSVParser(Parser):
# # = = = = [ / Stand.Dev=0 & Sum of delimiters ] = = = = #
# # OUTPUT example:
# # Delimiters = [
# # ['\t', 5, 5, 0.0],
# # [',', 75, 5, 0.0],
# # ['\t', 5, 5, 0.0],
# # [',', 75, 5, 0.0],
# # ['|', 5, 5, 0.0]
# # ]
......@@ -68,23 +66,22 @@ class CSVParser(Parser):
# # = = = = [ Delimiter selection ] = = = = #
Sorted_Delims = sorted(Delimiters, key=lambda x: x[1], reverse=True)
HighestDelim = Sorted_Delims[0][0]
# print("selected delimiter:",[HighestDelim]
# print
# HighestDelim = ","
print("CSV selected delimiter:",[HighestDelim])
# # = = = = [ / Delimiter selection ] = = = = #
# # = = = = [ First data coordinate ] = = = = #
Coords = {
"row": -1,
"column": -1
}
ifile = open( sample_file, "r" )
reader = csv.reader(ifile, delimiter=HighestDelim)
reader = csv.reader(contents, delimiter=HighestDelim)
for rownum, tokens in enumerate(reader):
if rownum % 250 == 0:
print("CSV row: ", rownum)
joined_tokens = "".join (tokens)
if Coords["row"]<0 and len( joined_tokens )>0 :
Coords["row"] = rownum
......@@ -93,22 +90,21 @@ class CSVParser(Parser):
if len(t)>0:
Coords["column"] = columnum
break
ifile.close()
# # = = = = [ / First data coordinate ] = = = = #
# # = = = = [ Setting Headers ] = = = = #
Headers_Int2Str = {}
ifile = open( sample_file, "r" )
reader = csv.reader(ifile, delimiter=HighestDelim)
reader = csv.reader(contents, delimiter=HighestDelim)
for rownum, tokens in enumerate(reader):
if rownum>=Coords["row"]:
for columnum in range( Coords["column"],len(tokens) ):
t = tokens[columnum]
Headers_Int2Str[columnum] = t
break
ifile.close()
# print("Headers_Int2Str")
# print(Headers_Int2Str)
# # = = = = [ / Setting Headers ] = = = = #
# # OUTPUT example:
# # Headers_Int2Str = {
......@@ -119,11 +115,9 @@ class CSVParser(Parser):
# # }
# # = = = = [ Reading the whole CSV and saving ] = = = = #
hyperdata_list = []
ifile = open( filename, "r" )
reader = csv.reader(ifile, delimiter=HighestDelim)
reader = csv.reader(contents, delimiter=HighestDelim)
for rownum, tokens in enumerate(reader):
if rownum>Coords["row"]:
RecordDict = {}
......@@ -131,7 +125,6 @@ class CSVParser(Parser):
data = tokens[columnum]
RecordDict[ Headers_Int2Str[columnum] ] = data
hyperdata_list.append( RecordDict )
ifile.close()
# # = = = = [ / Reading the whole CSV and saving ] = = = = #
return hyperdata_list
......@@ -8,4 +8,4 @@ from .Pubmed import PubmedParser
from .Europress import EuropressParser
# from .ISTex import ISTexParser
# from .CSV import CSVParser
from .CSV import CSVParser
This diff is collapsed.
......@@ -11,17 +11,18 @@ urlpatterns = [
url(r'^nodes/(\d+)/facets$' , nodes.CorpusFacet.as_view()),
url(r'^nodes/(\d+)/having$' , nodes.NodeListHaving.as_view()),
# get a list of ngram_ids or ngram_infos by list_id
#
# url(r'^ngramlists/(\d+)$', ngramlists.List.as_view()),
# add or remove ngram from a list
# ex: add <=> PUT ngramlists/change?list=42&ngrams=1,2
# rm <=> DEL ngramlists/change?list=42&ngrams=1,2
url(r'^ngramlists/change$', ngramlists.ListChange.as_view()),
# entire combination of lists from a corpus
# get entire combination of lists from a corpus
# (or any combination of lists that go together :
# - a mainlist
# - an optional stoplist
# - an optional maplist
# - an optional grouplist
# aka lexical model
url(r'^ngramlists/family$' , ngramlists.ListFamily.as_view()),
# - an optional grouplist)
url(r'^ngramlists/family$', ngramlists.ListFamily.as_view()),
]
......@@ -22,7 +22,7 @@
<div class="row">
<div id="monthly-move-chart">
<center>
Select a time range in the chart with blue bars to zoom in
Select a score/frequency range in the chart with blue bars to zoom in
<p align="center">
<a class="btn btn-xs btn-default" role="button" href="/chart/corpus/{{ corpus.id }}/data.csv">Save</a>
<a class="btn btn-xs btn-default" href="javascript:volumeChart.filterAll();dc.redrawAll();">Reset</a></p>
......@@ -41,7 +41,12 @@
<br>
</div>
<input type="hidden" id="list_id" value="{{ list_id }}"></input>
<!-- (values set by js) caching our DB ids (handy for list update commands) -->
<input type="hidden" id="mainlist_id" value=""></input>
<input type="hidden" id="maplist_id" value=""></input>
<input type="hidden" id="stoplist_id" value=""></input>
<input type="hidden" id="groups_id" value=""></input>
<input type="hidden" id="scores_id" value=""></input>
<div class="row">
<div class="panel panel-default">
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment