Commit c7526ad7 authored by delanoe

[FIX] merge url conflicts.

parents 8be7e5a7 a77ea0cf
// Flow diagram of the ngram extraction steps in gargantext.util.toolchain.
// Each node is a list or intermediate result; each edge means "feeds into".
//
// Render to PNG with:
// dot ngram_parsing_flow.dot -Tpng -o ngram_parsing_flow.png
digraph ngramflow {
// default font size applied to every edge label below
edge [fontsize=10] ;
// HTML-like graph title: bold underlined module name, then a subtitle line
label=<<B><U>gargantext.util.toolchain</U></B><BR/>(ngram extraction flow)>;
// "t" puts the graph title at the top of the drawing
labelloc="t" ;
// extracted ngrams feed both the grouping list and the occs/tfidf scores
"extracted_ngrams" -> "grouplist" ;
"extracted_ngrams" -> "occs+tfidfs" ;
// the user's main stoplist feeds the stoplist, which feeds the mainlist
"main_user_stoplist" -> "stoplist" ;
"stoplist" -> "mainlist" ;
// edge labels carry upper-case names (TFIDF_LIMIT, COOCS_THRESHOLD, ...);
// presumably the toolchain constants governing that step -- TODO confirm
"occs+tfidfs" -> "mainlist" [label=" TFIDF_LIMIT"];
"mainlist" -> "coocs" [label=" COOCS_THRESHOLD"] ;
"coocs" -> "specificity" ;
// "\n" inside the label splits it over two lines
"specificity" -> "maplist" [label="MAPLIST_LIMIT\nMONOGRAM_PART"];
"maplist" -> "explore" ;
// the grouplist also feeds the maplist, alongside the specificity result
"grouplist" -> "maplist" ;
}
...@@ -156,10 +156,10 @@ RESOURCETYPES = [ ...@@ -156,10 +156,10 @@ RESOURCETYPES = [
'parser': RISParser, 'parser': RISParser,
'default_language': 'en', 'default_language': 'en',
}, },
# { 'name': 'CSV', { 'name': 'CSV',
# # 'parser': CSVParser, 'parser': CSVParser,
# 'default_language': 'en', 'default_language': 'en',
# }, },
# { 'name': 'ISTex', # { 'name': 'ISTex',
# # 'parser': ISTexParser, # # 'parser': ISTexParser,
# 'default_language': 'en', # 'default_language': 'en',
......
...@@ -35,8 +35,7 @@ Double = DOUBLE_PRECISION ...@@ -35,8 +35,7 @@ Double = DOUBLE_PRECISION
# useful for queries # useful for queries
from sqlalchemy.orm import aliased from sqlalchemy.orm import aliased
from sqlalchemy import func from sqlalchemy import func, desc
# bulk insertions # bulk insertions
......
...@@ -8,34 +8,32 @@ import os ...@@ -8,34 +8,32 @@ import os
class CSVParser(Parser): class CSVParser(Parser):
def CSVsample( self, filename , delim) : def CSVsample( self, small_contents , delim) :
ifile = open( filename, "r" ) reader = csv.reader(small_contents, delimiter=delim)
reader = csv.reader(ifile, delimiter=delim)
Freqs = [] Freqs = []
for row in reader: for row in reader:
Freqs.append(len(row)) Freqs.append(len(row))
ifile.close()
return Freqs return Freqs
def parse(self, filename): def parse(self, filename):
print("CSV: parsing (assuming UTF-8 and LF line endings)")
contents = filename.read().decode("UTF-8").split("\n")
sample_size = 10 sample_size = 10
sample_file = filename.replace(".csv","_sample.csv") sample_contents = contents[0:sample_size]
hyperdata_list = [] hyperdata_list = []
command_for_sample = "cat '"+filename+"' | head -n "+str(sample_size)+" > '"+sample_file+"'"
os.system(command_for_sample) # you just created a *_sample.csv
# # = = = = [ Getting delimiters frequency ] = = = = # # # = = = = [ Getting delimiters frequency ] = = = = #
PossibleDelimiters = [ ',',' ','\t', ';', '|', ':' ] PossibleDelimiters = [ ',',' ','\t', ';', '|', ':' ]
AllDelimiters = {} AllDelimiters = {}
for delim in PossibleDelimiters: for delim in PossibleDelimiters:
AllDelimiters[delim] = self.CSVsample( sample_file , delim ) AllDelimiters[delim] = self.CSVsample( sample_contents , delim )
# # = = = = [ / Getting delimiters frequency ] = = = = # # # = = = = [ / Getting delimiters frequency ] = = = = #
# # OUTPUT example: # # OUTPUT example:
# # AllDelimiters = { # # AllDelimiters = {
...@@ -59,8 +57,8 @@ class CSVParser(Parser): ...@@ -59,8 +57,8 @@ class CSVParser(Parser):
# # = = = = [ / Stand.Dev=0 & Sum of delimiters ] = = = = # # # = = = = [ / Stand.Dev=0 & Sum of delimiters ] = = = = #
# # OUTPUT example: # # OUTPUT example:
# # Delimiters = [ # # Delimiters = [
# # ['\t', 5, 5, 0.0], # # ['\t', 5, 5, 0.0],
# # [',', 75, 5, 0.0], # # [',', 75, 5, 0.0],
# # ['|', 5, 5, 0.0] # # ['|', 5, 5, 0.0]
# # ] # # ]
...@@ -68,23 +66,22 @@ class CSVParser(Parser): ...@@ -68,23 +66,22 @@ class CSVParser(Parser):
# # = = = = [ Delimiter selection ] = = = = # # # = = = = [ Delimiter selection ] = = = = #
Sorted_Delims = sorted(Delimiters, key=lambda x: x[1], reverse=True) Sorted_Delims = sorted(Delimiters, key=lambda x: x[1], reverse=True)
HighestDelim = Sorted_Delims[0][0] HighestDelim = Sorted_Delims[0][0]
# print("selected delimiter:",[HighestDelim] # HighestDelim = ","
# print print("CSV selected delimiter:",[HighestDelim])
# # = = = = [ / Delimiter selection ] = = = = # # # = = = = [ / Delimiter selection ] = = = = #
# # = = = = [ First data coordinate ] = = = = # # # = = = = [ First data coordinate ] = = = = #
Coords = { Coords = {
"row": -1, "row": -1,
"column": -1 "column": -1
} }
ifile = open( sample_file, "r" ) reader = csv.reader(contents, delimiter=HighestDelim)
reader = csv.reader(ifile, delimiter=HighestDelim)
for rownum, tokens in enumerate(reader): for rownum, tokens in enumerate(reader):
if rownum % 250 == 0:
print("CSV row: ", rownum)
joined_tokens = "".join (tokens) joined_tokens = "".join (tokens)
if Coords["row"]<0 and len( joined_tokens )>0 : if Coords["row"]<0 and len( joined_tokens )>0 :
Coords["row"] = rownum Coords["row"] = rownum
...@@ -93,22 +90,21 @@ class CSVParser(Parser): ...@@ -93,22 +90,21 @@ class CSVParser(Parser):
if len(t)>0: if len(t)>0:
Coords["column"] = columnum Coords["column"] = columnum
break break
ifile.close()
# # = = = = [ / First data coordinate ] = = = = # # # = = = = [ / First data coordinate ] = = = = #
# # = = = = [ Setting Headers ] = = = = # # # = = = = [ Setting Headers ] = = = = #
Headers_Int2Str = {} Headers_Int2Str = {}
ifile = open( sample_file, "r" ) reader = csv.reader(contents, delimiter=HighestDelim)
reader = csv.reader(ifile, delimiter=HighestDelim)
for rownum, tokens in enumerate(reader): for rownum, tokens in enumerate(reader):
if rownum>=Coords["row"]: if rownum>=Coords["row"]:
for columnum in range( Coords["column"],len(tokens) ): for columnum in range( Coords["column"],len(tokens) ):
t = tokens[columnum] t = tokens[columnum]
Headers_Int2Str[columnum] = t Headers_Int2Str[columnum] = t
break break
ifile.close() # print("Headers_Int2Str")
# print(Headers_Int2Str)
# # = = = = [ / Setting Headers ] = = = = # # # = = = = [ / Setting Headers ] = = = = #
# # OUTPUT example: # # OUTPUT example:
# # Headers_Int2Str = { # # Headers_Int2Str = {
...@@ -119,11 +115,9 @@ class CSVParser(Parser): ...@@ -119,11 +115,9 @@ class CSVParser(Parser):
# # } # # }
# # = = = = [ Reading the whole CSV and saving ] = = = = # # # = = = = [ Reading the whole CSV and saving ] = = = = #
hyperdata_list = [] hyperdata_list = []
ifile = open( filename, "r" ) reader = csv.reader(contents, delimiter=HighestDelim)
reader = csv.reader(ifile, delimiter=HighestDelim)
for rownum, tokens in enumerate(reader): for rownum, tokens in enumerate(reader):
if rownum>Coords["row"]: if rownum>Coords["row"]:
RecordDict = {} RecordDict = {}
...@@ -131,7 +125,6 @@ class CSVParser(Parser): ...@@ -131,7 +125,6 @@ class CSVParser(Parser):
data = tokens[columnum] data = tokens[columnum]
RecordDict[ Headers_Int2Str[columnum] ] = data RecordDict[ Headers_Int2Str[columnum] ] = data
hyperdata_list.append( RecordDict ) hyperdata_list.append( RecordDict )
ifile.close()
# # = = = = [ / Reading the whole CSV and saving ] = = = = # # # = = = = [ / Reading the whole CSV and saving ] = = = = #
return hyperdata_list return hyperdata_list
...@@ -8,4 +8,4 @@ from .Pubmed import PubmedParser ...@@ -8,4 +8,4 @@ from .Pubmed import PubmedParser
from .Europress import EuropressParser from .Europress import EuropressParser
# from .ISTex import ISTexParser # from .ISTex import ISTexParser
# from .CSV import CSVParser from .CSV import CSVParser
This diff is collapsed.
...@@ -11,17 +11,18 @@ urlpatterns = [ ...@@ -11,17 +11,18 @@ urlpatterns = [
url(r'^nodes/(\d+)/facets$' , nodes.CorpusFacet.as_view()), url(r'^nodes/(\d+)/facets$' , nodes.CorpusFacet.as_view()),
url(r'^nodes/(\d+)/having$' , nodes.NodeListHaving.as_view()), url(r'^nodes/(\d+)/having$' , nodes.NodeListHaving.as_view()),
# get a list of ngram_ids or ngram_infos by list_id # add or remove ngram from a list
# # ex: add <=> PUT ngramlists/change?list=42&ngrams=1,2
# url(r'^ngramlists/(\d+)$', ngramlists.List.as_view()), # rm <=> DEL ngramlists/change?list=42&ngrams=1,2
url(r'^ngramlists/change$', ngramlists.ListChange.as_view()),
# entire combination of lists from a corpus # get entire combination of lists from a corpus
# (or any combination of lists that go together : # (or any combination of lists that go together :
# - a mainlist # - a mainlist
# - an optional stoplist # - an optional stoplist
# - an optional maplist # - an optional maplist
# - an optional grouplist # - an optional grouplist)
# aka lexical model url(r'^ngramlists/family$', ngramlists.ListFamily.as_view()),
url(r'^ngramlists/family$' , ngramlists.ListFamily.as_view()),
] ]
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
<div class="row"> <div class="row">
<div id="monthly-move-chart"> <div id="monthly-move-chart">
<center> <center>
Select a time range in the chart with blue bars to zoom in Select a score/frequency range in the chart with blue bars to zoom in
<p align="center"> <p align="center">
<a class="btn btn-xs btn-default" role="button" href="/chart/corpus/{{ corpus.id }}/data.csv">Save</a> <a class="btn btn-xs btn-default" role="button" href="/chart/corpus/{{ corpus.id }}/data.csv">Save</a>
<a class="btn btn-xs btn-default" href="javascript:volumeChart.filterAll();dc.redrawAll();">Reset</a></p> <a class="btn btn-xs btn-default" href="javascript:volumeChart.filterAll();dc.redrawAll();">Reset</a></p>
...@@ -41,7 +41,12 @@ ...@@ -41,7 +41,12 @@
<br> <br>
</div> </div>
<input type="hidden" id="list_id" value="{{ list_id }}"></input> <!-- (values set by js) caching our DB ids (handy for list update commands) -->
<input type="hidden" id="mainlist_id" value=""></input>
<input type="hidden" id="maplist_id" value=""></input>
<input type="hidden" id="stoplist_id" value=""></input>
<input type="hidden" id="groups_id" value=""></input>
<input type="hidden" id="scores_id" value=""></input>
<div class="row"> <div class="row">
<div class="panel panel-default"> <div class="panel panel-default">
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment