Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
3a445e73
Commit
3a445e73
authored
Jun 18, 2016
by
delanoe
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'romain-goodies' into unstable
parents
ebe22cf3
0ca0bf13
Changes
11
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
1160 additions
and
119 deletions
+1160
-119
db.py
gargantext/util/db.py
+25
-2
group_tools.py
gargantext/util/group_tools.py
+187
-0
ngramlists_tools.py
gargantext/util/ngramlists_tools.py
+724
-0
ngrams_extraction.py
gargantext/util/toolchain/ngrams_extraction.py
+3
-0
ngramlists.py
gargantext/views/api/ngramlists.py
+65
-95
urls.py
gargantext/views/api/urls.py
+9
-0
terms.py
gargantext/views/pages/terms.py
+4
-1
requirements.txt
install/python/requirements.txt
+1
-0
menu.css
static/lib/gargantext/menu.css
+10
-0
terms.html
templates/pages/corpora/terms.html
+105
-11
menu.html
templates/pages/menu.html
+27
-10
No files found.
gargantext/util/db.py
View file @
3a445e73
...
...
@@ -86,7 +86,19 @@ class bulk_insert:
readline
=
read
def
bulk_insert_ifnotexists
(
model
,
uniquekey
,
fields
,
data
,
cursor
=
None
):
def
bulk_insert_ifnotexists
(
model
,
uniquekey
,
fields
,
data
,
cursor
=
None
,
do_stats
=
False
):
"""
Inserts bulk data with an intermediate check on a uniquekey
(ex: Ngram.terms) to see if the row existed before.
If the row already existed we just retrieve its id.
If it didn't exist we create it and retrieve the id.
Returns a dict {uniquekey => id}
Option:
do stats: also returns the number of those that had no previous id
"""
if
cursor
is
None
:
db
,
cursor
=
get_cursor
()
mustcommit
=
True
...
...
@@ -109,6 +121,7 @@ def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None):
sourcetable
=
model
.
__tablename__
,
uniquecolumn
=
uniquekey
,
))
# insert what has not been found to the real table
cursor
.
execute
(
'''
INSERT INTO {sourcetable} ({columns})
...
...
@@ -119,6 +132,11 @@ def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None):
sourcetable
=
model
.
__tablename__
,
columns
=
', '
.
join
(
fields
),
))
if
do_stats
:
# remember how many rows we inserted just now
n_new
=
cursor
.
rowcount
# retrieve dict associating unique key to id
cursor
.
execute
(
'''
SELECT source.id, source.{uniquecolumn}
...
...
@@ -130,10 +148,15 @@ def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None):
columns
=
', '
.
join
(
fields
),
))
result
=
{
# term : new_id
row
[
1
]:
row
[
0
]
for
row
in
cursor
.
fetchall
()
}
# this is the end!
cursor
.
execute
(
'DROP TABLE __tmp__'
)
if
mustcommit
:
db
.
commit
()
return
result
if
do_stats
:
return
result
,
n_new
else
:
return
result
gargantext/util/group_tools.py
0 → 100644
View file @
3a445e73
"""
Utilities for group management
- query_grouped_ngrams(group_id) to retrieve subforms
- group_union() to join two groupings lists
"""
from
gargantext.util.db
import
session
,
aliased
from
gargantext.models
import
Ngram
,
NodeNgramNgram
from
igraph
import
Graph
# for group_union
def query_groups(groupings_id, details=False):
    """
    Lists the (mainform, subform) couples of a grouping node,
    aka the (ngram1_id, ngram2_id) links.

    Parameter:
      - details: if False, just send the array of couples
                 if True, send quadruplets with (ngram1_id, term1, ngram2_id, term2)
    """
    if details:
        # detailed contents: the two ids plus both term strings
        # (two aliases because Ngram is joined twice, once per side)
        MainNgram = aliased(Ngram)
        SubNgram  = aliased(Ngram)
        query = (
            session.query(
                NodeNgramNgram.ngram1_id,
                MainNgram.terms,
                NodeNgramNgram.ngram2_id,
                SubNgram.terms,
            )
            .join(MainNgram, NodeNgramNgram.ngram1_id == MainNgram.id)
            .join(SubNgram,  NodeNgramNgram.ngram2_id == SubNgram.id)
        )
    else:
        # simple contents: just the id couples
        query = session.query(NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id)

    # main filter: restrict to the requested grouping node
    return query.filter(NodeNgramNgram.node_id == groupings_id)
def query_grouped_ngrams(groupings_id, details=False, scoring_metric_id=None):
    """
    Lists the "hidden" ngram_ids (subforms) recorded in a grouping node.
    Works only for grouplists.

    Parameter:
      - details: if False, send just the array of ngram_ids
                 if True, send couples with (ngram_id, term)
      - scoring_metric_id: deprecated
                 (was: id of a scoring metric node (TFIDF or OCCS)
                  for details and sorting;
                  no more OCCS counts of subforms)
    """
    if details:
        # detailed contents: subform ids + their terms
        # (the NodeNodeNgram score join is deliberately disabled)
        query = (
            session.query(
                NodeNgramNgram.ngram2_id,
                Ngram.terms,
                # NodeNodeNgram.score #
            )
            .join(Ngram, NodeNgramNgram.ngram2_id == Ngram.id)
            # .join(NodeNodeNgram, NodeNgramNgram.ngram2_id == NodeNodeNgram.ngram_id)
            # .filter(NodeNodeNgram.node1_id == scoring_metric_id)
            # .order_by(desc(NodeNodeNgram.score))
        )
    else:
        # simple contents: subform ids only
        query = session.query(NodeNgramNgram.ngram2_id)

    # main filter: restrict to the requested grouping node
    return query.filter(NodeNgramNgram.node_id == groupings_id)
def group_union(g_a_links, g_b_links):
    """
    Joins two synonym-link lists into one.

    Synonym groups are modelled by sets of couples in the DB.
    Input : 2 arrays of links (ngramx_id, ngramy_id)
    Output: 1 array of links (ngramx_id, ngramy_id)

    Synonymity is considered transitive so in effect the groups
    can form a set (defined by the connected component of couples).
    A requested feature is also that one node dominates others
    (aka "leader effect"; leader will be in the map, the others won't)

    Summary of major union effects in various cases:
      GROUP 1    Group 2    Group 1 ∪ 2
      A -> B     A -> C     A -> B      (simple union)
                            A -> C
      D -> E     E -> F     D -> E
                            D -> F      (D "leader effect")
      G -> H     G -> I     G -> H      ( transitivity +
      H -> J                G -> I        "leader effect")
                            G -> J

    rloth: this is some slightly amended code
           from Samuel's in rest_v1_0.ngrams.Group.get

    TODO use "most frequent" score if leader candidates are ex aequo by degree.
    """
    # output: list of links forming the new group
    merged_links = []

    # 1) create a directed graph with both link lists
    # -----------------------------------------------
    # the set of all our ngram_ids
    vertex_ids = set(ngid for pair in g_a_links + g_b_links for ngid in pair)

    # initialize the synonym graph with size
    syn_graph = Graph(len(vertex_ids), directed=True)
    # our IDs become the "name" attribute (special attribute good for edge creation)
    syn_graph.vs['name'] = [str(v) for v in vertex_ids]
    # add both edge lists as named couples
    syn_graph.add_edges([(str(a), str(b)) for (a, b) in g_a_links])
    syn_graph.add_edges([(str(a), str(b)) for (a, b) in g_b_links])

    # 2) list resulting components
    # ----------------------------
    # each undirected connected component is one synonym group
    # (there should be no singletons by construction)
    components = syn_graph.as_undirected().components()

    # all outdegrees, for "leader" detection
    # (leader = term most often marked as source by the users)
    out_degrees = syn_graph.outdegree()

    for comp in components:
        # map back to our integer ids, preserving order
        comp_ngram_ids = [int(name) for name in syn_graph.vs[comp]['name']]

        # 3) take main node and unnest into new links list
        # ------------------------------------------------
        # leader = first member with the highest outdegree
        # (max() keeps the first maximal element, like the original strict-> scan)
        leader_pos, _ = max(enumerate(comp),
                            key=lambda pos_vid: out_degrees[pos_vid[1]])
        leader = comp_ngram_ids.pop(leader_pos)

        # and unnest the others under the leader
        for other in comp_ngram_ids:
            merged_links.append((leader, other))

    return merged_links
gargantext/util/ngramlists_tools.py
0 → 100644
View file @
3a445e73
"""
Tools to work with ngramlists (MAINLIST, MAPLIST, STOPLIST)
- query_list(list_id) to retrieve ngrams
- export_ngramlists(corpus_node)
- import_ngramlists(corpus_node)
- merge_ngramlists(new_lists, onto_corpus = corpus_node)
"""
from
gargantext.util.group_tools
import
query_groups
,
group_union
from
gargantext.util.db
import
session
,
desc
,
func
,
\
bulk_insert_ifnotexists
from
gargantext.models
import
Ngram
,
NodeNgram
,
NodeNodeNgram
,
\
NodeNgramNgram
from
gargantext.util.lists
import
UnweightedList
,
Translations
# import will implement the same text cleaning procedures as toolchain
from
gargantext.util.toolchain.parsing
import
normalize_chars
from
gargantext.util.toolchain.ngrams_extraction
import
normalize_terms
from
sqlalchemy.sql
import
exists
from
os
import
path
from
csv
import
writer
,
reader
,
QUOTE_MINIMAL
from
collections
import
defaultdict
from
re
import
match
from
io
import
StringIO
# pseudo file to write CSV to memory
def query_list(list_id,
               pagination_limit=None, pagination_offset=None,
               details=False, scoring_metric_id=None, groupings_id=None
               ):
    """
    Paginated listing of ngram_ids in a NodeNgram lists.
    Works for a mainlist or stoplist or maplist (not grouplists!)

    Parameter:
      - pagination_limit, pagination_offset
      - details: if False, send just the array of ngram_ids
                 if True and no scoring, send couples with (ngram_id, term)
                 if True and a scoring_id, send triples with (ngram_id, term, scoring)
      - scoring_metric_id: id of a scoring metric node (TFIDF or OCCS)
                           (for details and sorting)
      - groupings_id: optional id of a list of grouping relations (synonyms)
                      (each synonym will be added to the list if not already in there)

    FIXME: subforms appended recently and not generalized enough
            => add a common part for all "if groupings_id"
            => provide the option also in combination with scoring
    """
    # simple contents
    if not details:
        query = session.query(NodeNgram.ngram_id).filter(NodeNgram.node_id == list_id)

        if groupings_id:
            subforms = (session.query(NodeNgramNgram.ngram2_id)
                        # subform ids...
                        .filter(NodeNgramNgram.node_id == groupings_id)
                        # .. that are connected to a mainform
                        .join(NodeNgram,
                              NodeNgram.ngram_id == NodeNgramNgram.ngram1_id)
                        # .. which is in the list
                        .filter(NodeNgram.node_id == list_id)
                        )
            # union with the main q
            query = query.union(subforms)

    # detailed contents (id + terms)
    elif not scoring_metric_id:
        query = (session.query(Ngram.id, Ngram.terms, Ngram.n)
                 .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
                 .filter(NodeNgram.node_id == list_id)
                 )

        if groupings_id:
            subforms = (session.query(Ngram.id, Ngram.terms, Ngram.n)
                        .join(NodeNgramNgram,
                              NodeNgramNgram.ngram2_id == Ngram.id)
                        # subform ids...
                        .filter(NodeNgramNgram.node_id == groupings_id)
                        # .. that are connected to a mainform
                        .join(NodeNgram,
                              NodeNgram.ngram_id == NodeNgramNgram.ngram1_id)
                        # .. which is in the list
                        .filter(NodeNgram.node_id == list_id)
                        )
            # union with the main q
            query = query.union(subforms)

    # detailed contents (id + terms) + score
    else:
        # NB: score can be undefined (eg ex-subform that now became free)
        #     ==> we need outerjoin
        #     and the filter needs to have scoring_metric_id so we do it before
        ScoresTable = (session
                       .query(NodeNodeNgram.score, NodeNodeNgram.ngram_id)
                       .filter(NodeNodeNgram.node1_id == scoring_metric_id)
                       .subquery()
                       )

        query = (session.query(
                    NodeNgram.ngram_id,
                    Ngram.terms,
                    ScoresTable.c.score
                 )
                 .join(Ngram, NodeNgram.ngram_id == Ngram.id)
                 # main filter ----------------------
                 .filter(NodeNgram.node_id == list_id)
                 # scores if possible
                 .outerjoin(ScoresTable,
                            ScoresTable.c.ngram_id == NodeNgram.ngram_id)
                 .order_by(desc(ScoresTable.c.score))
                 )

    if pagination_limit:
        query = query.limit(pagination_limit)

    if pagination_offset:
        # BUGFIX: was query.offset(pagination_offsets) — an undefined name
        # (typo for the parameter) that raised NameError whenever an offset
        # was actually requested
        query = query.offset(pagination_offset)

    return query
# helper func for exports
def ngrams_to_csv_rows(ngram_objs, id_groupings=None, list_type=""):
    """
    Transcribes ngram objects into a basic info table, one row per ngram:
        (ng_id, term string, term size, list_type)
    with an optional 5th column of grouped subforms ex: "4|42"

    @param ngram_objs: an array of ngrams (eg: from a db query.all());
                       each object must expose .id, .terms and .n
    @param id_groupings: optional dict of sets
                         {mainform_id : {subform_idA, subform_idB, etc}}
                         (BUGFIX: default was a shared mutable {} literal;
                          now None, replaced by a fresh dict per call)
    @param list_type: a str 'map', 'main' or 'stop' to fill in col 4

    Returns a csv_rows matrix (as a list of lists)
        [
         [ligne1_colA, ligne1_colB..],
         [ligne2_colA, ligne2_colB..],
         ..
        ]
    (to be used for instance like: csv.writer.writerows(csv_rows))

    list_type ici:
        0 <=> stopList
        1 <=> miamList
        2 <=> mapList
    """
    if id_groupings is None:
        id_groupings = {}

    # transcrire les objets ngrammes en tableau (liste de listes)
    csv_rows = list()
    for ng_obj in ngram_objs:
        ng_id = ng_obj.id
        if ng_id in id_groupings.keys():
            # subform ids joined like "4|42"
            this_grouped = "|".join(str(gid) for gid in id_groupings[ng_id])
        else:
            this_grouped = ""

        # transcription : 5 columns
        # ID , terme , n , type_de_liste , grouped_id|grouped_id...
        csv_rows.append(
            [ng_id, ng_obj.terms, ng_obj.n, list_type, this_grouped]
        )

    return csv_rows
def export_ngramlists(node, fname=None, delimiter="\t", titles=False):
    """
    Export of the 3 lists under a corpus node (MAP, MAIN, STOP)
    with local combination of groups.

    @param node: the corpus node
    @param fname: optional filename (str) to write the CSV,
                  or a filehandle-like to write into
                  (if absent, returns a str with CSV contents)
    @param delimiter: optional column separator in the CSV
                      (if absent defaults to tabulation)
    @param titles: optional flag to print or not a first line with headers

      # ID , term              , nwords , list_type , grouped_id|grouped_id...
      1622   textile             1        main        1623|3397
      3397   textile production  2        main
      3410   possibility         1        stop

    TODO : REFACTOR split list logic from corpus logic
           => possibility to act on one list
    """
    # the node arg has to be a corpus here
    if not hasattr(node, "typename") or node.typename != "CORPUS":
        raise TypeError("EXPORT: node argument must be a Corpus Node")

    # les nodes couvrant les listes
    # -----------------------------
    stoplist_node = node.children("STOPLIST").first()
    mainlist_node = node.children("MAINLIST").first()
    maplist_node  = node.children("MAPLIST").first()
    # et les groupes de synonymes
    group_node = node.children("GROUPLIST").first()

    # listes de ngram_ids correspondantes
    # ------------------------------------
    # contenu: liste des objets ngrammes [(2562,"monterme",1),...]
    stop_ngrams = query_list(stoplist_node.id, details=True,
                             groupings_id=group_node.id).all()
    main_ngrams = query_list(mainlist_node.id, details=True,
                             groupings_id=group_node.id).all()
    map_ngrams  = query_list(maplist_node.id, details=True,
                             groupings_id=group_node.id).all()

    # pour debug ---------->8 --------------------
    #~ stop_ngrams = stop_ngrams[0:10]
    #~ main_ngrams = main_ngrams[0:10]
    #~ map_ngrams = map_ngrams[0:10]
    # --------------------->8 --------------------

    # pour la group_list on a des couples de ngram_ids
    # -------------------
    # ex: [(3544, 2353), (2787, 4032), ...]
    group_ngram_id_couples = query_groups(group_node.id).all()

    # k couples comme set
    # --------------------
    # [(x => y1), (x => y2)] >~~~~~~~> [x => {y1,y2}]
    grouped = defaultdict(set)
    for ngram in group_ngram_id_couples:
        grouped[ngram[0]].add(ngram[1])

    # on applique notre fonction ng_to_csv sur chaque liste
    # ------------------------------------------------------
    map_csv_rows  = ngrams_to_csv_rows(map_ngrams,  id_groupings=grouped,
                                       list_type="map")
    stop_csv_rows = ngrams_to_csv_rows(stop_ngrams, id_groupings=grouped,
                                       list_type="stop")
    # miam contient map donc il y a un préalable ici
    map_ngram_ids = {ng.id for ng in map_ngrams}
    main_without_map = [ng for ng in main_ngrams if ng.id not in map_ngram_ids]
    miam_csv_rows = ngrams_to_csv_rows(main_without_map, id_groupings=grouped,
                                       list_type="main")

    # all lists together now
    this_corpus_all_rows = map_csv_rows + miam_csv_rows + stop_csv_rows

    # choice of output: file or string
    # BUGFIX: straight_to_handle must be initialized before the branching,
    # otherwise the `elif straight_to_handle` below raised NameError
    # whenever fname was a str path
    straight_to_handle = False
    if fname == None:
        out_file = StringIO()
    elif type(fname) == str:
        out_file = open(fname, 'w')
    else:
        straight_to_handle = True
        out_file = fname

    # csv.writer()
    csv_wr = writer(out_file, delimiter=delimiter, quoting=QUOTE_MINIMAL)

    if titles:
        csv_wr.writerow(["oldid", "term", "nwords", "listtype", "subforms"])

    # write to outfile
    csv_wr.writerows(this_corpus_all_rows)

    if fname == None:
        # return output as a string
        print("EXPORT: wrote %i ngrams to CSV string"
              % len(this_corpus_all_rows))
        return out_file.getvalue()
    elif straight_to_handle:
        print("EXPORT: wrote %i ngrams to CSV response handle"
              % len(this_corpus_all_rows))
    else:
        # just close output file
        out_file.close()
        print("EXPORT: wrote %i ngrams to CSV file '%s'"
              % (len(this_corpus_all_rows), path.abspath(fname)))
def import_ngramlists(fname, delimiter='\t', group_delimiter='|'):
    '''
    This function reads a CSV of an ngrams table for a Corpus,
    then it converts old ngram_ids to those of the current DB
      (and adds to DB any unknown ngrams)
    then recreates an equivalent set of MAINLIST, MAPLIST, STOPLIST + GROUPS

    Input example:
       oldid | term          |nwords| ltype |group_oldids
      -------+---------------+------+--------+---------------
       3842   water table      2     map     3724
       3724   water tables     2     map
       4277   water supply     2     map     190362|13415
       13415  water supplies   2     map
       190362 water-supply     1     map
       20489  wastewater       1     map

    Output: 3 x UnweightedList + 1 x Translations

    @param fname            a local filename or a filehandle-like
    @param delimiter        a character used as separator in the CSV
    @param group_delimiter  a character used as grouped subforms separator
                            (in the last column)

    The conversion of old_id to ngram_id works in 2 steps:
        => look up each term str in the DB with bulk_insert_ifnotexists
           (creates absent ngrams if necessary)
        => use the new ids to map the relations involving the old ones

    NB: the creation of MAINLIST also adds all elements from the MAPLIST

    NB: To merge the imported lists into a corpus node's lists,
        chain this function with merge_ngramlists()
    '''
    # --------------
    # main storage for the ngrams by list
    import_nodes_ngrams = {'stop': [], 'main': [], 'map': []}

    # separate storage for the term's couples [(term str, nwords int),...]
    imported_ngrams_dbdata = []

    # and all the old ids, by term (for id lookup after dbdata bulk_insert)
    imported_ngrams_oldids = {}

    # and for the imported_grouping list of couples [(x1,y1),(x1,y2),(x2,y3),..]
    imported_groupings = []
    # /!\ imported_grouping contains only external ids (aka oldids)
    #     (ie imported ids.. that will have to be translated to target db ids)

    # skipped lines can (very rarely) be used in groups => mark as ignored
    ignored_oldids = []

    # =============== READ CSV ===============
    if isinstance(fname, str):
        # BUGFIX: open in binary mode — the .decode("UTF-8") below requires
        # bytes, and a text-mode handle would have raised AttributeError
        fh = open(fname, "rb")
    elif callable(getattr(fname, "read", None)):
        fh = fname
    else:
        # BUGFIX: error message referenced undefined name `fh`
        # (raised NameError instead of the intended TypeError)
        raise TypeError("IMPORT: fname argument has unknown type %s"
                        % type(fname))

    # reading all directly b/c csv.reader takes only lines or a real fh in bytes
    # and we usually have a "false" fh (uploadedfile.InMemoryUploadedFile) in strings
    # (but we checked its size before!)
    contents = fh.read().decode("UTF-8").split("\n")

    # end of CSV read
    fh.close()
    # <class 'django.core.files.uploadedfile.InMemoryUploadedFile'>

    ngrams_csv_rows = reader(contents,
                             delimiter=delimiter,
                             quoting=QUOTE_MINIMAL)

    # for stats
    n_read_lines = 0
    n_total_ng = 0
    n_added_ng = 0
    n_group_relations = 0

    # load CSV + initial checks
    for i, csv_row in enumerate(ngrams_csv_rows):
        # fyi
        n_read_lines += 1
        # print("---------------READ LINE %i" % i)
        if not len(csv_row):
            continue

        try:
            this_ng_oldid  = str(csv_row[0])
            this_ng_term   = str(csv_row[1])
            this_ng_nwords = int(csv_row[2])
            this_list_type = str(csv_row[3])
            this_ng_group  = str(csv_row[4])

            # string normalizations
            this_ng_term = normalize_terms(normalize_chars(this_ng_term))
        except:
            if i == 0:
                print("IMPORT WARN: (skip line) probable header line at CSV %s:l.0"
                      % fname)
                continue
            else:
                raise ValueError("Error on CSV read line %i" % n_read_lines)

        # --- check format before any old ID retrieve
        if not match(r"\d+$", this_ng_oldid):
            print("IMPORT WARN: (skip line) bad ID at CSV %s:l.%i"
                  % (fname, i))
            continue
        else:
            this_ng_oldid = int(this_ng_oldid)

        # --- term checking
        if not len(this_ng_term) > 0:
            print("IMPORT WARN: (skip line) empty term at CSV %s:l.%i"
                  % (fname, i))
            ignored_oldids.append(this_ng_oldid)
            continue

        # --- check if not a duplicate string
        if this_ng_term in imported_ngrams_oldids:
            ignored_oldids.append(this_ng_oldid)
            print("IMPORT WARN: (skip line) term appears more than once (previous id: %i) at CSV %s:l.%i"
                  % (imported_ngrams_oldids[this_ng_term], fname, i))
            continue

        # --- check correct list type
        if not this_list_type in ['stop', 'main', 'map']:
            ignored_oldids.append(this_ng_oldid)
            print("IMPORT WARN: (skip line) wrong list type at CSV %s:l.%i"
                  % (fname, i))
            continue

        # ================= Store the data ====================
        # the ngram data
        imported_ngrams_dbdata.append([this_ng_term, this_ng_nwords])
        imported_ngrams_oldids[this_ng_term] = this_ng_oldid

        # and the "list to ngram" relation
        import_nodes_ngrams[this_list_type].append(this_ng_oldid)

        # ====== Store synonyms from the import (if any) ======
        if len(this_ng_group) != 0:
            # BUGFIX: split on the group_delimiter parameter
            # (was hard-coded '|', silently ignoring the argument)
            group_as_external_ids = this_ng_group.split(group_delimiter)
            for external_subform_id in group_as_external_ids:
                external_subform_id = int(external_subform_id)
                imported_groupings.append(
                    (this_ng_oldid, external_subform_id))

    # ======== ngram save + id lookup =========
    n_total_ng = len(imported_ngrams_dbdata)

    # returns a dict {term => id} and a count of inserted ones
    (new_ngrams_ids, n_added_ng) = bulk_insert_ifnotexists(
        model=Ngram,
        uniquekey='terms',
        fields=('terms', 'n'),
        data=imported_ngrams_dbdata,
        do_stats=True,
    )
    del imported_ngrams_dbdata

    # loop on old ngrams and create direct mapping old_id => new_id
    old_to_new_id_map = {}
    for term, oldid in imported_ngrams_oldids.items():
        old_to_new_id_map[oldid] = new_ngrams_ids[term]

    del new_ngrams_ids
    del imported_ngrams_oldids

    # print(old_to_new_id_map)
    # print(import_nodes_ngrams)

    # ======== Import into lists =========
    # 3 x abstract lists + 1 translations
    result = {
        'map':       UnweightedList(),
        'main':      UnweightedList(),
        'stop':      UnweightedList(),
        'groupings': Translations(),
    }

    for list_type in import_nodes_ngrams:
        for old_id in import_nodes_ngrams[list_type]:
            new_id = old_to_new_id_map[old_id]
            # add to the abstract list
            result[list_type].items.add(new_id)

        # for main also add map elements
        if list_type == 'main':
            for old_id in import_nodes_ngrams['map']:
                new_id = old_to_new_id_map[old_id]
                result['main'].items.add(new_id)

    # ======== Synonyms =========
    for (x, y) in imported_groupings:
        if (x not in ignored_oldids) and (y not in ignored_oldids):
            new_mainform_id = old_to_new_id_map[x]
            new_subform_id  = old_to_new_id_map[y]
            # /!\ Translations use (subform => mainform) order
            result['groupings'].items[new_subform_id] = new_mainform_id
            n_group_relations += 1

    # ------------------------------------------------------------------
    print("IMPORT: read %i lines from the CSV" % n_read_lines)
    print("IMPORT: read %i terms (%i added and %i already existing)"
          % (n_total_ng, n_added_ng, n_total_ng - n_added_ng))
    print("IMPORT: read %i grouping relations" % n_group_relations)

    return result
def merge_ngramlists(new_lists=None, onto_corpus=None, del_originals=None):
    """
    Integrates an external terms table to the current one:
       - merges groups (using group_union() function)
       - resolves conflicts if terms belong in different lists
          > map wins over both other types
          > main wins over stop
          > stop never wins

    @param new_lists: a dict of *new* imported lists with format:
            {'stop':      UnweightedList,
             'main':      UnweightedList,
             'map':       UnweightedList,
             'groupings': Translations }
           /!\\ this dict is consumed (its keys are deleted) during the merge
           (BUGFIX: defaults were shared mutable {} / [] literals; now None)

    @param onto_corpus: a corpus node to get the *old* lists

    @param del_originals: an array of original wordlists to ignore
           and delete during the merge
           possible values : ['stop','main','map']

           par exemple
           del_originals = ['stop','main'] => effacera la stoplist
                                              et la mainlist
                                           mais pas la maplist qui sera fusionnée
                         (les éléments de la map list
                          seront remis dans la main à la fin)

    NB: Uses group_tools.group_union() to merge the synonym links.

    FIXME: new terms created at import_ngramlists() can now be added to lists
           but are never added to docs
    """
    if new_lists is None:
        new_lists = {}
    if del_originals is None:
        del_originals = []

    # log to send back to client-side (lines will be joined)
    my_log = []

    # the tgt node arg has to be a corpus here
    if not hasattr(onto_corpus, "typename") or onto_corpus.typename != "CORPUS":
        raise TypeError("IMPORT: 'onto_corpus' argument must be a Corpus Node")

    # for stats
    added_nd_ng = 0   # number of added list elements

    # our list shortcuts will be 0,1,2 (aka lid)
    # by order of precedence
    linfos = [
        {'key': 'stop', 'name': "STOPLIST"},    # lid = 0
        {'key': 'main', 'name': "MAINLIST"},    # lid = 1
        {'key': 'map',  'name': "MAPLIST"}      # lid = 2
    ]

    # ======== Get the old lists =========
    old_lists = {}

    # DB nodes stored with same indices 0,1,2 (resp. stop, miam and map)
    # find target ids of the list node objects
    tgt_nodeids = [
        onto_corpus.children("STOPLIST").first().id,
        onto_corpus.children("MAINLIST").first().id,
        onto_corpus.children("MAPLIST").first().id
    ]

    old_group_id = onto_corpus.children("GROUPLIST").first().id

    # retrieve old data into old_lists[list_type]...
    # ----------------------------------------------
    for lid, linfo in enumerate(linfos):
        list_type = linfo['key']
        if list_type not in del_originals:
            # NB can't use UnweightedList(tgt_nodeids[lid])
            #    because we need to include out-of-list subforms
            list_ngrams_q = query_list(tgt_nodeids[lid],
                                       groupings_id=old_group_id)
            old_lists[list_type] = UnweightedList(list_ngrams_q.all())
        else:
            # ...or use empty objects if replacing old list
            # ----------------------------------------------
            old_lists[list_type] = UnweightedList()
            msg = "MERGE: ignoring old %s which will be overwritten" % linfo['name']
            print(msg)
            my_log.append(msg)

    # ======== Merging all involved ngrams =========
    # all memberships with resolved conflicts of interfering memberships
    resolved_memberships = {}

    for list_set in [old_lists, new_lists]:
        for lid, info in enumerate(linfos):
            list_type = info['key']
            # we use the fact that lids are ordered ints...
            for ng_id in list_set[list_type].items:
                if ng_id not in resolved_memberships:
                    resolved_memberships[ng_id] = lid
                else:
                    # ...now resolving is simply taking the max
                    #    stop < main < map
                    resolved_memberships[ng_id] = max(
                        lid,
                        resolved_memberships[ng_id]
                    )
    # now each ngram is only in its most important list
    # -------------------------------------------------
    # NB temporarily map items are not in main anymore
    #    but we'll copy it at the end
    # NB temporarily all subforms were treated separately
    #    from mainforms but we'll force them into same list
    #    after we merge the groups
    del old_lists
    del new_lists['stop']
    del new_lists['main']
    del new_lists['map']

    # ======== Merging old and new groups =========
    # get the arcs already in the target DB (directed couples)
    previous_links = session.query(
        NodeNgramNgram.ngram1_id,
        NodeNgramNgram.ngram2_id
    ).filter(
        NodeNgramNgram.node_id == old_group_id
    ).all()

    n_links_previous = len(previous_links)

    # same format for the new arcs (Translations ~~~> array of couples)
    translated_imported_links = []
    add_link = translated_imported_links.append
    n_links_added = 0
    for (y, x) in new_lists['groupings'].items.items():
        add_link((x, y))
        n_links_added += 1
    del new_lists

    # group_union: joins 2 different synonym-links lists into 1 new list
    new_links = group_union(previous_links, translated_imported_links)
    del previous_links
    del translated_imported_links

    n_links_after = len(new_links)

    merged_group = Translations([(y, x) for (x, y) in new_links])
    del new_links

    # ======== Overwrite old data with new =========
    merged_group.save(old_group_id)

    msg = "MERGE: groupings %i updated (links before/added/after: %i/%i/%i)" % (
        old_group_id, n_links_previous, n_links_added, n_links_after)
    my_log.append(msg)
    print(msg)

    # ======== Target list(s) append data =========
    # if list 2 => write in both tgt_data_lists [1,2]
    # lists 0 or 1 => straightforward targets [0] or [1]
    merged_results = {
        'stop': UnweightedList(),
        'main': UnweightedList(),
        'map':  UnweightedList()
    }

    for (ng_id, winner_lid) in resolved_memberships.items():
        ## 1) using the new groups
        # normal case if not a subform
        if ng_id not in merged_group.items:
            target_lid = winner_lid
        # inherit case if is a subform
        else:
            mainform_id = merged_group.items[ng_id]
            # inherited winner
            try:
                target_lid = resolved_memberships[mainform_id]
            except KeyError:
                target_lid = winner_lid
                print("MERGE: WARN ng_id %i has incorrect mainform %i ?"
                      % (ng_id, mainform_id))

        ## 2) map => map + main
        if target_lid == 2:
            todo_lids = [1, 2]
        else:
            todo_lids = [target_lid]

        ## 3) storage
        for lid in todo_lids:
            list_type = linfos[lid]['key']
            merged_results[list_type].items.add(ng_id)

    # print("IMPORT: added %i elements in the lists indices" % added_nd_ng)

    # ======== Overwrite old data with new =========
    for lid, info in enumerate(linfos):
        tgt_id = tgt_nodeids[lid]
        list_type = info['key']
        result = merged_results[list_type]
        result.save(tgt_id)
        msg = "MERGE: %s %i updated (new size: %i)" % (
            info['name'], tgt_id, len(merged_results[list_type].items))
        my_log.append(msg)
        print(msg)

    # return a log
    return("\n".join(my_log))
gargantext/util/toolchain/ngrams_extraction.py
View file @
3a445e73
...
...
@@ -9,6 +9,9 @@ from re import sub
from
gargantext.util.scheduling
import
scheduled
def
_integrate_associations
(
nodes_ngrams_count
,
ngrams_data
,
db
,
cursor
):
"""
@param ngrams_data a set like {('single word', 2), ('apple', 1),...}
"""
print
(
'INTEGRATE'
)
# integrate ngrams
ngrams_ids
=
bulk_insert_ifnotexists
(
...
...
gargantext/views/api/ngramlists.py
View file @
3a445e73
...
...
@@ -8,118 +8,88 @@ API views for advanced operations on ngrams and ngramlists
"""
from
gargantext.util.http
import
APIView
,
get_parameters
,
JsonHttpResponse
,
\
ValidationException
,
Http404
from
gargantext.util.db
import
session
,
aliased
,
desc
,
bulk_insert
ValidationException
,
Http404
,
HttpResponse
from
gargantext.util.db
import
session
,
aliased
,
bulk_insert
from
gargantext.util.db_cache
import
cache
from
sqlalchemy
import
tuple_
from
gargantext.models
import
Ngram
,
NodeNgram
,
NodeNodeNgram
,
NodeNgramNgram
from
gargantext.util.lists
import
UnweightedList
,
Translations
# useful subroutines
from
gargantext.util.ngramlists_tools
import
query_list
,
export_ngramlists
,
\
import_ngramlists
,
merge_ngramlists
from
gargantext.util.group_tools
import
query_grouped_ngrams
def
_query_list
(
list_id
,
pagination_limit
=
None
,
pagination_offset
=
None
,
details
=
False
,
scoring_metric_id
=
None
):
class
List
(
APIView
):
"""
see already available API query api/nodes/<list_id>?fields[]=ngrams
"""
Paginated listing of ngram_ids in a NodeNgram lists.
pass
Works for a mainlist or stoplist or maplist (not grouplists!)
Parameter:
- pagination_limit, pagination_offset
- details: if False, send just the array of ngram_ids
if True, send triples with (ngram_id, term, scoring)
^^^^^^^
- scoring_metric_id: id of a scoring metric node (TFIDF or OCCS)
(for details and sorting)
class
CSVLists
(
APIView
):
"""
if
not
details
:
# simple contents
query
=
session
.
query
(
NodeNgram
.
ngram_id
)
.
filter
(
NodeNgram
.
node_id
==
list_id
)
else
:
# detailed contents (terms and some NodeNodeNgram for score)
# NB: score can be undefined (eg ex-subform that now became free)
# ==> we need outerjoin
# and the filter needs to have scoring_metric_id so we do it before
ScoresTable
=
(
session
.
query
(
NodeNodeNgram
.
score
,
NodeNodeNgram
.
ngram_id
)
.
filter
(
NodeNodeNgram
.
node1_id
==
scoring_metric_id
)
.
subquery
()
)
query
=
(
session
.
query
(
NodeNgram
.
ngram_id
,
Ngram
.
terms
,
ScoresTable
.
c
.
score
)
.
join
(
Ngram
,
NodeNgram
.
ngram_id
==
Ngram
.
id
)
# main filter ----------------------
.
filter
(
NodeNgram
.
node_id
==
list_id
)
# scores if possible
.
outerjoin
(
ScoresTable
,
ScoresTable
.
c
.
ngram_id
==
NodeNgram
.
ngram_id
)
.
order_by
(
desc
(
ScoresTable
.
c
.
score
))
)
For CSV exports of all lists of a corpus
if
pagination_limit
:
query
=
query
.
limit
(
pagination_limit
)
Or CSV import into existing lists as "patch"
"""
def
get
(
self
,
request
):
params
=
get_parameters
(
request
)
corpus_id
=
int
(
params
.
pop
(
"corpus"
))
corpus_node
=
cache
.
Node
[
corpus_id
]
if
pagination_offset
:
query
=
query
.
offset
(
pagination_offsets
)
# response is file-like + headers
response
=
HttpResponse
(
content_type
=
'text/csv'
)
response
[
'Content-Disposition'
]
=
'attachment; filename="corpus-
%
i_gargantext_term_list.csv"'
%
corpus_id
return
query
# fill the response with the data
export_ngramlists
(
corpus_node
,
fname
=
response
,
titles
=
True
)
return
response
def
post
(
self
,
request
):
"""
Merge the lists of a corpus with other lists from a CSV source
or from another corpus
params in request.GET:
corpus: the corpus whose lists are getting patched
params in request.FILES:
csvsource: the csv file
def
_query_grouped_ngrams
(
groupings_id
,
details
=
False
,
scoring_metric_id
=
None
):
"""
Listing of "hidden" ngram_ids from the groups
or in get
dbsource: another corpus instead of the csvfile
(? this last option should perhaps not be in CSVLists ?)
Works only for grouplists
NB: not using PATCH because we'll need POST file upload
Parameter:
- details: if False, send just the array of ngram_ids
if True, send triples with (ngram_id, term, scoring)
^^^^^^^
deprecated: scoring_metric_id: id of a scoring metric node (TFIDF or OCCS)
(for details and sorting)
(no more OCCS counts of subforms)
"""
if
not
details
:
# simple contents
query
=
session
.
query
(
NodeNgramNgram
.
ngram2_id
)
else
:
# detailed contents (terms and some NodeNodeNgram for score)
query
=
(
session
.
query
(
NodeNgramNgram
.
ngram2_id
,
Ngram
.
terms
,
# NodeNodeNgram.score #
)
.
join
(
Ngram
,
NodeNgramNgram
.
ngram2_id
==
Ngram
.
id
)
# .join(NodeNodeNgram, NodeNgramNgram.ngram2_id == NodeNodeNgram.ngram_id)
# .filter(NodeNodeNgram.node1_id == scoring_metric_id)
# .order_by(desc(NodeNodeNgram.score))
)
/!
\
We assume we checked the file size client-side before upload
# main filter
# -----------
query
=
query
.
filter
(
NodeNgramNgram
.
node_id
==
groupings_id
)
£TODO check authentication and user.id
"""
# this time the corpus param is the one with the target lists to be patched
params
=
get_parameters
(
request
)
corpus_id
=
int
(
params
.
pop
(
"onto_corpus"
))
corpus_node
=
cache
.
Node
[
corpus_id
]
return
query
# request also contains the file
# csv_file has type django.core.files.uploadedfile.InMemoryUploadedFile
# ----------------------
csv_file
=
request
.
data
[
'csvfile'
]
# import the csv
new_lists
=
import_ngramlists
(
csv_file
)
del
csv_file
# merge the new_lists onto those of the target corpus
log_msg
=
merge_ngramlists
(
new_lists
,
onto_corpus
=
corpus_node
)
return
JsonHttpResponse
({
'log'
:
log_msg
,
},
200
)
class
List
(
APIView
):
"""
see already available API query api/nodes/<list_id>?fields[]=ngrams
"""
pass
class
GroupChange
(
APIView
):
...
...
@@ -441,7 +411,7 @@ class MapListGlance(APIView):
listmembers
=
{
'maplist'
:[]}
# ngram ids sorted per list name
# infos for all ngrams from maplist
map_ngrams
=
_
query_list
(
maplist_id
,
details
=
True
,
map_ngrams
=
query_list
(
maplist_id
,
details
=
True
,
scoring_metric_id
=
scores_id
)
.
all
()
# ex: [(8805, 'mean age', 4.0),
...
...
@@ -566,25 +536,25 @@ class ListFamily(APIView):
if
"head"
in
parameters
:
# head <=> only mainlist AND only k top ngrams
glance_limit
=
int
(
parameters
[
'head'
])
mainlist_query
=
_
query_list
(
mainlist_id
,
details
=
True
,
mainlist_query
=
query_list
(
mainlist_id
,
details
=
True
,
pagination_limit
=
glance_limit
,
scoring_metric_id
=
scores_id
)
else
:
# infos for all ngrams from mainlist
mainlist_query
=
_
query_list
(
mainlist_id
,
details
=
True
,
mainlist_query
=
query_list
(
mainlist_id
,
details
=
True
,
scoring_metric_id
=
scores_id
)
# infos for grouped ngrams, absent from mainlist
hidden_ngrams_query
=
_
query_grouped_ngrams
(
groups_id
,
details
=
True
,
hidden_ngrams_query
=
query_grouped_ngrams
(
groups_id
,
details
=
True
,
scoring_metric_id
=
scores_id
)
# infos for stoplist terms, absent from mainlist
stop_ngrams_query
=
_
query_list
(
other_list_ids
[
'stoplist'
],
details
=
True
,
stop_ngrams_query
=
query_list
(
other_list_ids
[
'stoplist'
],
details
=
True
,
scoring_metric_id
=
scores_id
)
# and for the other lists (stop and map)
# no details needed here, just the member ids
for
li
in
other_list_ids
:
li_elts
=
_
query_list
(
other_list_ids
[
li
],
details
=
False
li_elts
=
query_list
(
other_list_ids
[
li
],
details
=
False
)
.
all
()
# simple array of ngram_ids
listmembers
[
li
]
=
[
ng
[
0
]
for
ng
in
li_elts
]
...
...
gargantext/views/api/urls.py
View file @
3a445e73
...
...
@@ -27,6 +27,15 @@ urlpatterns = [ url(r'^nodes$' , nodes.NodeListResource.as_view()
# \
# corpus id
,
url
(
r'^ngramlists/export$'
,
ngramlists
.
CSVLists
.
as_view
()
)
# get a CSV export of the ngramlists of a corpus
# ex: GET ngramlists/export?corpus=43
# TODO : unify to a /api/ngrams?formatted=csv
# (similar to /api/nodes?formatted=csv)
,
url
(
r'^ngramlists/import$'
,
ngramlists
.
CSVLists
.
as_view
()
)
# same handling class as export (CSVLists)
# but this route used only for POST + file
,
url
(
r'^ngramlists/change$'
,
ngramlists
.
ListChange
.
as_view
()
)
# add or remove ngram from a list
...
...
gargantext/views/pages/terms.py
View file @
3a445e73
...
...
@@ -33,6 +33,9 @@ def ngramtable(request, project_id, corpus_id):
'project'
:
project
,
'corpus'
:
corpus
,
'resourcename'
:
resourcename
(
corpus
),
'view'
:
'terms'
'view'
:
'terms'
,
# for the CSV import modal
'csvimportroute'
:
"/api/ngramlists/import?onto_corpus=
%
i"
%
corpus
.
id
},
)
install/python/requirements.txt
View file @
3a445e73
...
...
@@ -11,6 +11,7 @@ django-pgfields==1.4.4
django-pgjsonb==0.0.16
djangorestframework==3.3.2
html5lib==0.9999999
python-igraph>=0.7.1
jdatetime==1.7.2
kombu==3.0.33 # messaging
nltk==3.1
...
...
static/lib/gargantext/menu.css
View file @
3a445e73
...
...
@@ -19,3 +19,13 @@
line-height
:
.85
;
margin-bottom
:
-5px
;
}
.exportbtn
{
/* border: 1px solid #333 ; */
margin-top
:
17px
;
/* valigns with bootstrap h2 */
}
.btn
.glyphicon
{
/* glyphicons are always rendered too high within bootstrap buttons */
vertical-align
:
middle
}
templates/pages/corpora/terms.html
View file @
3a445e73
...
...
@@ -72,6 +72,15 @@
<button
id=
"Save_All"
class=
"btn btn-muted"
disabled
style=
"font-size:120%"
>
<b>
Save all changes
</b>
</button>
<br/>
<br/>
<!-- import icon -->
<span
class=
"needsaveicon glyphicon glyphicon-import"
></span>
<button
id=
"ImportList"
class=
"btn btn-warning"
style=
"font-size:120%"
onclick=
"$('#csvimport').modal('show');"
>
<b>
Import a Termlist
</b>
</button>
</div>
<!-- see in javascript function queries.functions['my_state_filter'] -->
<div
class=
"pull-right"
style=
"margin-top:2.1em;padding-left:1em;"
>
...
...
@@ -107,25 +116,110 @@
</div>
<!-- /div panel -->
</div>
<!-- /jumbotron -->
<!--
<button id="ImportList" onclick="GetUserPortfolio();" class="btn btn-warning">
Import a Corpus-List
</button>
-->
<!--</div> This div is closed in the menu !-->
<!--</div> This div is closed in the menu !-->
<!--
# stub to import a list (aka orange button)
<button id="ImportList" onclick="GetUserPortfolio();" class="btn btn-warning">Import a Corpus-List</button>
-->
<div
class=
"modal"
aria-hidden=
"true"
id=
"csvimport"
>
<div
class=
"modal-dialog"
>
<div
class=
"modal-content"
>
<div
class=
"modal-header"
>
<button
type=
"button"
class=
"close"
data-dismiss=
"modal"
aria-hidden=
"true"
>
×
</button>
<h3
id=
"myModalLabel"
>
Import a CSV term list
</h3>
</div>
<div
class=
"modal-body"
id=
"uploadform"
>
<form
id=
"csvimportform"
onsubmit=
"return postCSV(event)"
enctype=
"multipart/form-data"
method=
"post"
>
{% csrf_token %}
<label>
From your disk:
</label>
<input
type=
"file"
id=
"csvfile"
accept=
"text/csv"
>
<br/>
<label>
From another corpus:
</label>
<p>
TODO
</p>
<br/>
<input
type=
"submit"
class=
"btn btn-xs btn-info"
id=
"csvsubmit"
value=
"Submit"
/>
</form>
</div>
<div
class=
"modal-footer"
id=
"formanswer"
></div>
</div>
</div>
</div>
<script
type=
"text/javascript"
src=
"{% static "
lib
/
jquery
/
dynatable
/
jquery
.
dynatable
.
js
"
%}"
></script>
<!-- custom-lib for dynatable.js and dc.js -->
<script
type=
"text/javascript"
src=
"{% static "
lib
/
gargantext
/
NGrams_dyna_chart_and_table
.
js
"
%}"
></script>
<script
type=
"text/javascript"
>
/* merci c24b !
* Uses csvimportroute variable from the django template
* Ex: /api/ngramlists/import?onto_corpus=corpus_id
*
* Uses input#csvfile as source data.
*/
function
postCSV
(
e
){
// don't do page reload of usual submits
e
.
preventDefault
()
// 2MB ≈ 70000 ngrams
var
max_size
=
2097152
// we take it straight from the input element
theFile
=
$
(
'input#csvfile'
)[
0
].
files
[
0
]
// debug
// console.log(theFile.name, "size", theFile.size, theFile.lastModifiedDate)
if
(
!
theFile
)
{
console
.
warn
(
'Ignoring "submit": no provided file'
)
return
false
}
else
if
(
theFile
.
size
>
max_size
)
{
console
.
warn
(
'Ignoring "submit": file is too big'
)
$
(
'#formanswer'
).
html
(
'The import failed: your file is too big ('
+
max_size
/
1024
+
'kB max).'
);
return
false
}
// normal case
else
{
// append into an empty form (or fixme: initialize it using form element)
var
myFileFormData
=
new
FormData
();
myFileFormData
.
append
(
"csvfile"
,
theFile
)
//postCorpusFile
$
.
ajax
({
url
:
"{{csvimportroute | safe}}"
,
type
:
'POST'
,
async
:
true
,
contentType
:
false
,
processData
:
false
,
data
:
myFileFormData
,
beforeSend
:
function
(
xhr
)
{
xhr
.
setRequestHeader
(
"X-CSRFToken"
,
getCookie
(
"csrftoken"
));
},
success
:
function
(
response
)
{
my_html
=
"<h2 color='green'>IMPORT OK ! </h2>"
my_html
+=
"<p class='note'>"
+
response
[
'log'
].
replace
(
/
\n
/g
,
'<br/>'
)
+
"</p>"
my_html
+=
"<p'>(this page will reload in 3s)</p>"
$
(
'#formanswer'
).
html
(
my_html
);
console
.
log
(
response
)
;
// reload after 3s
setTimeout
(
"location.reload(true)"
,
3000
);
},
error
:
function
(
result
)
{
$
(
'#formanswer'
).
html
(
'Erreur'
);
console
.
error
(
result
);
},
});
$
(
'#formanswer'
).
html
(
'CSV import in Progress'
);
}
};
</script>
{% endblock %}
templates/pages/menu.html
View file @
3a445e73
...
...
@@ -41,7 +41,7 @@
{% if corpus %}
<li><a
href=
"/projects/{{project.id}}/corpora/{{corpus.id}}"
>
<span
class=
"glyphicon glyphicon-file"
aria-hidden=
"true"
></span>
{{corpus.name | truncatechars:
1
5}}
{{corpus.name | truncatechars:
2
5}}
</a>
</li>
{% endif %}
...
...
@@ -150,12 +150,32 @@
<br>
<br>
<div
class=
"row"
>
<h3>
<a
href=
"/projects/{{project.id}}"
>
<span
class=
"glyphicon glyphicon-book"
aria-hidden=
"true"
></span>
{{ project.name | truncatechars:50}}
<div
class=
"col-md-6"
>
<h3>
<a
href=
"/projects/{{project.id}}"
>
<span
class=
"glyphicon glyphicon-book"
aria-hidden=
"true"
></span>
{{ project.name | truncatechars:50}}
</a>
</h3>
</div>
<!-- export button -->
<div
class=
"col-md-6"
>
{% if view == 'terms' %}
<a
class=
"btn btn-primary exportbtn pull-right"
role=
"button"
href=
"/api/ngramlists/export?corpus={{corpus.id}}"
title=
"Export terms table in CSV"
>
Export terms table
<span
class=
"glyphicon glyphicon-download"
aria-hidden=
"true"
></span>
</a>
{% elif view == 'titles' %}
<a
class=
"btn btn-primary exportbtn pull-right"
role=
"button"
href=
"/api/nodes?parent_id={{corpus.id}}&types[]=DOCUMENT&pagination_limit=100000&formated=csv"
title=
"Export full corpus in CSV"
>
Export corpus
<span
class=
"glyphicon glyphicon-download"
aria-hidden=
"true"
></span>
</a>
</h3>
{% else %}
<!-- TODO export journal table -->
{% endif %}
</div>
</div>
<div
class=
"row"
>
<div
class=
"col-md-1"
>
...
...
@@ -167,10 +187,7 @@
</h3>
<h3>
<span
class=
"glyphicon glyphicon-file"
aria-hidden=
"true"
></span>
{{ corpus.name | truncatechars:20 }}
<a
class=
"btn btn-primary"
role=
"button"
href=
"/api/nodes?parent_id={{corpus.id}}&types[]=DOCUMENT&pagination_limit=100000&formated=csv"
>
<span
class=
"glyphicon glyphicon-download"
aria-hidden=
"true"
></span>
</a>
{{ corpus.name | truncatechars:30 }}
</h3>
</div>
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment