Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
75d4a738
Commit
75d4a738
authored
Jun 21, 2016
by
delanoe
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'romain-goodies' into unstable
parents
3a445e73
aacd15ef
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
347 additions
and
145 deletions
+347
-145
constants.py
gargantext/constants.py
+3
-0
ngramlists_tools.py
gargantext/util/ngramlists_tools.py
+215
-129
ngramlists.py
gargantext/views/api/ngramlists.py
+24
-11
tables.css
static/lib/gargantext/tables.css
+1
-1
terms.html
templates/pages/corpora/terms.html
+104
-4
No files found.
gargantext/constants.py
View file @
75d4a738
...
@@ -250,6 +250,9 @@ DEFAULT_INDEX_SUBGRAMS = False # False <=> traditional
...
@@ -250,6 +250,9 @@ DEFAULT_INDEX_SUBGRAMS = False # False <=> traditional
# at indexing after extraction)
# at indexing after extraction)
# ngram lists import/export parameters -----------------------------------------
DEFAULT_CSV_DELIM
=
'
\t
'
# for import/export CSV defaults
DEFAULT_CSV_DELIM_GROUP
=
'|&|'
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
...
...
gargantext/util/ngramlists_tools.py
View file @
75d4a738
...
@@ -15,6 +15,8 @@ from gargantext.models import Ngram, NodeNgram, NodeNodeNgram, \
...
@@ -15,6 +15,8 @@ from gargantext.models import Ngram, NodeNgram, NodeNodeNgram, \
from
gargantext.util.lists
import
UnweightedList
,
Translations
from
gargantext.util.lists
import
UnweightedList
,
Translations
from
gargantext.constants
import
DEFAULT_CSV_DELIM
,
DEFAULT_CSV_DELIM_GROUP
# import will implement the same text cleaning procedures as toolchain
# import will implement the same text cleaning procedures as toolchain
from
gargantext.util.toolchain.parsing
import
normalize_chars
from
gargantext.util.toolchain.parsing
import
normalize_chars
from
gargantext.util.toolchain.ngrams_extraction
import
normalize_terms
from
gargantext.util.toolchain.ngrams_extraction
import
normalize_terms
...
@@ -23,7 +25,7 @@ from sqlalchemy.sql import exists
...
@@ -23,7 +25,7 @@ from sqlalchemy.sql import exists
from
os
import
path
from
os
import
path
from
csv
import
writer
,
reader
,
QUOTE_MINIMAL
from
csv
import
writer
,
reader
,
QUOTE_MINIMAL
from
collections
import
defaultdict
from
collections
import
defaultdict
from
re
import
match
from
re
import
match
,
findall
from
io
import
StringIO
# pseudo file to write CSV to memory
from
io
import
StringIO
# pseudo file to write CSV to memory
def
query_list
(
list_id
,
def
query_list
(
list_id
,
...
@@ -124,57 +126,71 @@ def query_list(list_id,
...
@@ -124,57 +126,71 @@ def query_list(list_id,
# helper func for exports
# helper func for exports
def
ngrams_to_csv_rows
(
ngram_objs
,
id_groupings
=
{},
list_type
=
""
):
def
ngrams_to_csv_rows
(
ngram_objs
,
ngram_dico
=
{},
group_infos
=
{},
list_type
=
""
,
groupings_delim
=
DEFAULT_CSV_DELIM_GROUP
):
"""
"""
@param: ngram_objs
@param: ngram_objs
an array of ngrams (eg: from a db query.all())
an array of ngrams (eg: from a db query.all())
@param: optional id_groupings
@param: optional group_infos as links and subs
a dict of sets {mainform_id : {subform_idA, subform_idB, etc}}
ginfos{links} = a dict of sets
{mainform_id : {subform_idA, subform_idB, etc}}
ginfos{subs} = a reverse map
{subform_idA:mainform_id, subform_idB:mainform_id, etc}}
@param: list_type (a str 'map','main' or 'stop' to fill in col 4)
@param: list_type (a str 'map','main' or 'stop' to fill in col 4)
Outputs a
basic
info table per ngram
Outputs a
condensed
info table per ngram
(
ng_id, term string, term size, list_type
)
(
list_type, "term string"
)
with an optional 5th column of grouped subforms ex: "4|42"
with an optional 3rd column of grouped subforms
ex: "othertermstring|yetanothertermstring"
Returns format is a csv_rows matrix (as a list of lists)
Returns format is a csv_rows matrix (as a list of lists)
[
[
[
ligne1_colA, ligne
1_colB..],
[
row1_colA, row
1_colB..],
[
ligne2_colA, ligne
2_colB..],
[
row2_colA, row
2_colB..],
..
..
]
]
(to be used for instance like: csv.writer.writerows(csv_rows)
(to be used for instance like: csv.writer.writerows(csv_rows)
list_type ici:
list_type ici:
0 <=> stop
List
0 <=> stop
1 <=> miam
List
1 <=> miam
2 <=> map
List
2 <=> map
"""
"""
# transcri
re les objets ngrammes en tableau (liste de liste
s)
# transcri
be ngram objects to a table (array of row-array
s)
csv_rows
=
list
()
csv_rows
=
list
()
for
ng_obj
in
ngram_objs
:
for
ng_obj
in
ngram_objs
:
ng_id
=
ng_obj
.
id
ng_id
=
ng_obj
.
id
if
ng_id
in
id_groupings
.
keys
():
# only mainforms will get their own row
this_grouped
=
"|"
.
join
(
str
(
gid
)
for
gid
in
id_groupings
[
ng_id
])
if
ng_id
not
in
group_infos
[
'subs'
]:
else
:
this_grouped
=
""
# if has subforms
if
ng_id
in
group_infos
[
'links'
]:
this_grouped_terms
=
groupings_delim
.
join
(
# we replace grouped_ids by their terms string
[
ngram_dico
[
subf_id
]
for
subf_id
in
group_infos
[
'links'
][
ng_id
]]
)
# if no subforms
else
:
this_grouped_terms
=
""
# transcription : 5 columns
# transcription :
# ID , terme , n , type_de_liste , grouped_id|grouped_id...
# 3 columns = |status, | mainform, | forms
# (type_of_list) ( term ) ( subterm1|&|subterm2 )
csv_rows
.
append
(
csv_rows
.
append
(
[
ng_id
,
ng_obj
.
terms
,
ng_obj
.
n
,
list_type
,
this_grouped
]
[
list_type
,
ng_obj
.
terms
,
this_grouped_terms
]
)
)
return
csv_rows
return
csv_rows
def
export_ngramlists
(
node
,
fname
=
None
,
delimiter
=
"
\t
"
,
titles
=
Fals
e
):
def
export_ngramlists
(
node
,
fname
=
None
,
delimiter
=
DEFAULT_CSV_DELIM
,
titles
=
Tru
e
):
"""
"""
export of the 3 lists under a corpus node (MAP, MAIN, STOP)
export of the 3 lists under a corpus node (MAP, MAIN, STOP)
with local combination of groups
with local combination of groups
...
@@ -189,10 +205,9 @@ def export_ngramlists(node,fname=None,delimiter="\t",titles=False):
...
@@ -189,10 +205,9 @@ def export_ngramlists(node,fname=None,delimiter="\t",titles=False):
@param titles: optional flag to print or not a first line with headers
@param titles: optional flag to print or not a first line with headers
# ID , term , nwords , list_type , grouped_id|grouped_id...
status label forms
1622 textile 1 main 1623|3397
map textile textiles|&|textile production
3397 textile production 2 main
stop possibility
3410 possibility 1 stop
TODO : REFACTOR split list logic from corpus logic
TODO : REFACTOR split list logic from corpus logic
=> possibility to act on one list
=> possibility to act on one list
...
@@ -211,7 +226,6 @@ def export_ngramlists(node,fname=None,delimiter="\t",titles=False):
...
@@ -211,7 +226,6 @@ def export_ngramlists(node,fname=None,delimiter="\t",titles=False):
# et les groupes de synonymes
# et les groupes de synonymes
group_node
=
node
.
children
(
"GROUPLIST"
)
.
first
()
group_node
=
node
.
children
(
"GROUPLIST"
)
.
first
()
# listes de ngram_ids correspondantes
# listes de ngram_ids correspondantes
# ------------------------------------
# ------------------------------------
# contenu: liste des objets ngrammes [(2562,"monterme",1),...]
# contenu: liste des objets ngrammes [(2562,"monterme",1),...]
...
@@ -219,40 +233,56 @@ def export_ngramlists(node,fname=None,delimiter="\t",titles=False):
...
@@ -219,40 +233,56 @@ def export_ngramlists(node,fname=None,delimiter="\t",titles=False):
main_ngrams
=
query_list
(
mainlist_node
.
id
,
details
=
True
,
groupings_id
=
group_node
.
id
)
.
all
()
main_ngrams
=
query_list
(
mainlist_node
.
id
,
details
=
True
,
groupings_id
=
group_node
.
id
)
.
all
()
map_ngrams
=
query_list
(
maplist_node
.
id
,
details
=
True
,
groupings_id
=
group_node
.
id
)
.
all
()
map_ngrams
=
query_list
(
maplist_node
.
id
,
details
=
True
,
groupings_id
=
group_node
.
id
)
.
all
()
# pour debug ---------->8 --------------------
# pour debug ---------->8 --------------------
#~ stop_ngrams = stop_ngrams[0:10]
#~ stop_ngrams = stop_ngrams[0:10]
#~ main_ngrams = main_ngrams[0:10]
#~ main_ngrams = main_ngrams[0:10]
#~ map_ngrams = map_ngrams[0:10]
#~ map_ngrams = map_ngrams[0:10]
# --------------------->8 --------------------
# --------------------->8 --------------------
# pour la group_list on a des couples de ngram_ids
# preloop to fill a local copy of dictionary ng_id => ng_term_str
dico
=
{}
for
li
in
[
stop_ngrams
,
main_ngrams
,
map_ngrams
]:
for
(
ngid
,
ngterm
,
ignored
)
in
li
:
dico
[
ngid
]
=
ngterm
# for the groups we got couples of ids in the DB
# -------------------
# -------------------
# ex: [(3544, 2353), (2787, 4032), ...]
# ex: [(3544, 2353), (2787, 4032), ...]
group_ngram_id_couples
=
query_groups
(
group_node
.
id
)
.
all
()
group_ngram_id_couples
=
query_groups
(
group_node
.
id
)
.
all
()
# k couples comme set
# we expend this to double structure for groups lookup
# --------------------
# 1) g['links'] = k couples (x,y_i) as a set [x => {y1,y2}]
# [(x => y1), (x => y2)] >~~~~~~~> [x => {y1,y2}]
grouped
=
defaultdict
(
set
)
# 2) g['subs'] = reverse map like translations [(y1 => x), (y2 => x)]
g
=
{
"links"
:
defaultdict
(
set
),
"subs"
:
defaultdict
(
int
)
}
for
ngram
in
group_ngram_id_couples
:
for
ngram
in
group_ngram_id_couples
:
grouped
[
ngram
[
0
]]
.
add
(
ngram
[
1
])
x
=
int
(
ngram
[
0
])
y
=
int
(
ngram
[
1
])
g
[
'links'
][
x
]
.
add
(
y
)
g
[
'subs'
][
y
]
=
x
# on applique notre fonction ng_to_csv sur chaque liste
# on applique notre fonction ng_to_csv sur chaque liste
# ------------------------------------------------------
# ------------------------------------------------------
map_csv_rows
=
ngrams_to_csv_rows
(
map_ngrams
,
map_csv_rows
=
ngrams_to_csv_rows
(
map_ngrams
,
id_groupings
=
grouped
,
ngram_dico
=
dico
,
group_infos
=
g
,
list_type
=
"map"
)
list_type
=
"map"
)
stop_csv_rows
=
ngrams_to_csv_rows
(
stop_ngrams
,
stop_csv_rows
=
ngrams_to_csv_rows
(
stop_ngrams
,
id_groupings
=
grouped
,
ngram_dico
=
dico
,
group_infos
=
g
,
list_type
=
"stop"
)
list_type
=
"stop"
)
# miam contient map donc il y a un préalable ici
# miam contient map donc il y a un préalable ici
map_ngram_ids
=
{
ng
.
id
for
ng
in
map_ngrams
}
map_ngram_ids
=
{
ng
.
id
for
ng
in
map_ngrams
}
main_without_map
=
[
ng
for
ng
in
main_ngrams
if
ng
.
id
not
in
map_ngram_ids
]
main_without_map
=
[
ng
for
ng
in
main_ngrams
if
ng
.
id
not
in
map_ngram_ids
]
miam_csv_rows
=
ngrams_to_csv_rows
(
main_without_map
,
miam_csv_rows
=
ngrams_to_csv_rows
(
main_without_map
,
id_groupings
=
grouped
,
ngram_dico
=
dico
,
group_infos
=
g
,
list_type
=
"main"
)
list_type
=
"main"
)
# all lists together now
# all lists together now
...
@@ -273,7 +303,7 @@ def export_ngramlists(node,fname=None,delimiter="\t",titles=False):
...
@@ -273,7 +303,7 @@ def export_ngramlists(node,fname=None,delimiter="\t",titles=False):
quoting
=
QUOTE_MINIMAL
)
quoting
=
QUOTE_MINIMAL
)
if
titles
:
if
titles
:
csv_wr
.
writerow
([
"
oldid"
,
"term"
,
"nwords"
,
"listtype"
,
"sub
forms"
])
csv_wr
.
writerow
([
"
status"
,
"label"
,
"
forms"
])
# write to outfile
# write to outfile
csv_wr
.
writerows
(
this_corpus_all_rows
)
csv_wr
.
writerows
(
this_corpus_all_rows
)
...
@@ -294,7 +324,8 @@ def export_ngramlists(node,fname=None,delimiter="\t",titles=False):
...
@@ -294,7 +324,8 @@ def export_ngramlists(node,fname=None,delimiter="\t",titles=False):
def
import_ngramlists
(
fname
,
delimiter
=
'
\t
'
,
group_delimiter
=
'|'
):
def
import_ngramlists
(
fname
,
delimiter
=
DEFAULT_CSV_DELIM
,
group_delimiter
=
DEFAULT_CSV_DELIM_GROUP
):
'''
'''
This function reads a CSV of an ngrams table for a Corpus,
This function reads a CSV of an ngrams table for a Corpus,
then it converts old ngram_ids to those of the current DB
then it converts old ngram_ids to those of the current DB
...
@@ -302,23 +333,61 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'):
...
@@ -302,23 +333,61 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'):
then recreates an equivalent set of MAINLIST, MAPLIST, STOPLIST + GROUPS
then recreates an equivalent set of MAINLIST, MAPLIST, STOPLIST + GROUPS
Input example:
Input example:
oldid | term |nwords| ltype |group_oldids
status | label |forms
-------+---------------+------+--------+---------------
--------+---------------+---------------------
3842 water table 2 map 3724
map water table water tables
3724 water tables 2 map
map water supply water-supply|&|water supplies
4277 water supply 2 map 190362|13415
stop wastewater
13415 water supplies 2 map
190362 water-supply 1 map
The title line is mandatory.
20489 wastewater 1 map
The label will correspond to our DB mainform type.
Variants:
----------
For user accessibility, we allow different formats using equivalence rules:
1) It is implicit that the label string is also one of the forms
therefore the input example table is equivalent to this "verbose" table:
status | label |forms
--------+---------------+---------------------
map water table water table|&|water tables
map water supply water supply|&|water-supply|&|water supplies
stop wastewater wastewater
2) The default status is map and the status column is optional
thus, if we ignore "wastewater", the input table is also equivalent to:
label |forms
---------------+---------------------
water table water tables
water supply water-supply|&|water supplies
3) From DB point of view, both "forms that are labels" and "other forms" are
finally saved just as ngrams. So the input table is also equivalent to:
status | label |forms
--------+---------------+---------------------
map water table water tables
map water tables
map water supply water-supply|&|water supplies
map water supplies
map water-supply
stop wastewater
Output: 3 x UnweightedList + 1 x Translations
Output:
-------
3 x UnweightedList + 1 x Translations
@param fname a local filename or a filehandle-like
@param fname a local filename or a filehandle-like
@param delimiter a character used as separator in the CSV
@param delimiter a character used as separator in the CSV
@param group_delimiter a character used as grouped subforms separator
@param group_delimiter a character used as grouped subforms separator
(in the last column)
(in the last column)
The
conversion of old_id to ngram_id
works in 2 steps:
The
retrieval of ngram_ids
works in 2 steps:
=> look up each term str in the DB with bulk_insert_ifnotexists
=> look up each term str in the DB with bulk_insert_ifnotexists
(creates absent ngrams if necessary)
(creates absent ngrams if necessary)
=> use the new ids to map the relations involving the old ones
=> use the new ids to map the relations involving the old ones
...
@@ -328,28 +397,21 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'):
...
@@ -328,28 +397,21 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'):
NB: To merge the imported lists into a corpus node's lists,
NB: To merge the imported lists into a corpus node's lists,
chain this function with merge_ngramlists()
chain this function with merge_ngramlists()
'''
'''
# --------------
# --------------
-
#
#
ngram storage
# --------------
# --------------
-
# main storage for the ngrams by list
# main storage for the ngrams by list
import_nodes_ngrams
=
{
'stop'
:[],
'main'
:[],
'map'
:[]}
import
ed
_nodes_ngrams
=
{
'stop'
:[],
'main'
:[],
'map'
:[]}
# separate storage for the term's couples [(term str, nwords int),...]
# and all the terms (for unique and for dbdata bulk_insert)
imported_ngrams_dbdata
=
[]
imported_unique_ngramstrs
=
{}
# and all the old ids, by term (for id lookup after dbdata bulk_insert)
imported_ngrams_oldids
=
{}
# and for the imported_grouping list of couples [(
x1,y1),(x1,y2),(x2,y3),
..]
# and for the imported_grouping list of couples [(
str1,str1),(str1,str2)
..]
imported_groupings
=
[]
imported_groupings
=
[]
# /!\ imported_grouping contains only external ids (aka oldids)
# /!\ imported_grouping contains the subforms' terms themselves
# (ie imported ids.. that will have to be translated
# (that will have to be translated to ngram_ids for the target db)
# to target db ids)
# skipped lines can (very rarely) be used in groups => mark as ignored
ignored_oldids
=
[]
# =============== READ CSV ===============
# =============== READ CSV ===============
...
@@ -382,77 +444,105 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'):
...
@@ -382,77 +444,105 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'):
n_added_ng
=
0
n_added_ng
=
0
n_group_relations
=
0
n_group_relations
=
0
# columntype => int
columns
=
{}
# load CSV + initial checks
# load CSV + initial checks
for
i
,
csv_row
in
enumerate
(
ngrams_csv_rows
):
for
i
,
csv_row
in
enumerate
(
ngrams_csv_rows
):
# fyi
# fyi
n_read_lines
+=
1
n_read_lines
+=
1
# print("---------------READ LINE %i" % i)
# print("---------------READ LINE %i" % i)
# headers
if
i
==
0
:
n_cols
=
len
(
csv_row
)
for
j
,
colname
in
enumerate
(
csv_row
):
if
colname
in
[
'label'
,
'status'
,
'forms'
]:
columns
[
colname
]
=
j
else
:
raise
ValueError
(
'Wrong header "
%
s" on line
%
i (only possible headers are "label", "forms" and "status")'
%
(
colname
,
n_read_lines
))
if
'label'
not
in
columns
:
raise
ValueError
(
'CSV must contain at least one column with the header "label"'
)
if
not
len
(
csv_row
):
if
not
len
(
csv_row
):
continue
continue
try
:
# try:
this_ng_oldid
=
str
(
csv_row
[
0
])
# mandatory column
this_ng_term
=
str
(
csv_row
[
1
])
this_row_label
=
str
(
csv_row
[
columns
[
'label'
]])
this_ng_nwords
=
int
(
csv_row
[
2
])
this_list_type
=
str
(
csv_row
[
3
])
this_ng_group
=
str
(
csv_row
[
4
])
# string normalizations
this_ng_term
=
normalize_terms
(
normalize_chars
(
this_ng_term
))
except
:
# other columns or their default values
if
i
==
0
:
if
'status'
in
columns
:
print
(
"IMPORT WARN: (skip line) probable header line at CSV
%
s:l.0"
%
fname
)
this_list_type
=
str
(
csv_row
[
columns
[
'status'
]])
continue
else
:
else
:
this_list_type
=
'map'
raise
ValueError
(
"Error on CSV read line
%
i"
%
n_read_lines
)
# --- check format before any old ID retrieve
if
'forms'
in
columns
:
if
not
match
(
r"\d+$"
,
this_ng_oldid
):
this_row_forms
=
str
(
csv_row
[
columns
[
'forms'
]])
print
(
"IMPORT WARN: (skip line) bad ID at CSV
%
s:l.
%
i"
%
(
fname
,
i
))
continue
else
:
else
:
this_ng_oldid
=
int
(
this_ng_oldid
)
this_row_forms
=
''
# string normalizations
this_row_label
=
normalize_terms
(
normalize_chars
(
this_row_label
))
# except:
# if i == 0:
# print("IMPORT WARN: (skip line) probable header line at CSV %s:l.0" % fname)
# continue
# else:
# raise ValueError("Error on CSV read line %i" % i)
# --- term checking
# --- term checking
if
not
len
(
this_
ng_term
)
>
0
:
if
not
len
(
this_
row_label
)
>
0
:
print
(
"IMPORT WARN: (skip line) empty term at CSV
%
s:l.
%
i"
%
(
fname
,
i
))
print
(
"IMPORT WARN: (skip line) empty term at CSV
%
s:l.
%
i"
%
(
fname
,
i
))
ignored_oldids
.
append
(
this_ng_oldid
)
continue
# --- check if not a duplicate string
if
this_ng_term
in
imported_ngrams_oldids
:
ignored_oldids
.
append
(
this_ng_oldid
)
print
(
"IMPORT WARN: (skip line) term appears more than once (previous id:
%
i) at CSV
%
s:l.
%
i"
%
(
imported_ngrams_oldids
[
this_ng_term
],
fname
,
i
))
continue
continue
# --- check correct list type
# --- check correct list type
if
not
this_list_type
in
[
'stop'
,
'main'
,
'map'
]:
if
not
this_list_type
in
[
'stop'
,
'main'
,
'map'
]:
ignored_oldids
.
append
(
this_ng_oldid
)
print
(
"IMPORT WARN: (skip line) wrong list type at CSV
%
s:l.
%
i"
%
(
fname
,
i
))
print
(
"IMPORT WARN: (skip line) wrong list type at CSV
%
s:l.
%
i"
%
(
fname
,
i
))
continue
continue
# subforms can be duplicated (in forms and another label)
# but we must take care of unwanted other duplicates too
if
this_row_label
in
imported_unique_ngramstrs
:
print
(
"TODO IMPORT DUPL: (skip line) term appears more than once at CSV
%
s:l.
%
i"
%
(
fname
,
i
))
# ================= Store the data ====================
# ================= Store the data ====================
# the ngram data
# the ngram census
imported_ngrams_dbdata
.
append
([
this_ng_term
,
this_ng_nwords
])
imported_unique_ngramstrs
[
this_row_label
]
=
True
imported_ngrams_oldids
[
this_ng_term
]
=
this_ng_oldid
# and the "list to ngram" relation
# and the "list to ngram" relation
import
_nodes_ngrams
[
this_list_type
]
.
append
(
this_ng_oldid
)
import
ed_nodes_ngrams
[
this_list_type
]
.
append
(
this_row_label
)
# ====== Store synonyms from the import (if any) ======
# ====== Store synonyms from the import (if any) ======
if
len
(
this_ng_group
)
!=
0
:
if
len
(
this_row_forms
)
!=
0
:
group_as_external_ids
=
this_ng_group
.
split
(
'|'
)
other_terms
=
[]
for
raw_term_str
in
this_row_forms
.
split
(
group_delimiter
):
for
external_subform_id
in
group_as_external_ids
:
external_subform_id
=
int
(
external_subform_id
)
# each subform is also like an ngram declaration
imported_groupings
.
append
(
term_str
=
normalize_terms
(
normalize_chars
(
raw_term_str
))
(
this_ng_oldid
,
external_subform_id
)
imported_unique_ngramstrs
[
term_str
]
=
True
)
imported_nodes_ngrams
[
this_list_type
]
.
append
(
term_str
)
# the optional repeated mainform doesn't interest us
# because we already have it via the label
if
term_str
!=
this_row_label
:
# save links
imported_groupings
.
append
(
(
this_row_label
,
term_str
)
)
# ======== ngram save + id lookup =========
# ======== ngram save + id lookup =========
n_total_ng
=
len
(
imported_ngrams_dbdata
)
n_total_ng
=
len
(
imported_unique_ngramstrs
)
# prepare data format
imported_ngrams_dbdata
=
[]
for
ngram_str
in
imported_unique_ngramstrs
:
# DB needs the number of separate words
n_words
=
1
+
len
(
findall
(
r' '
,
ngram_str
))
imported_ngrams_dbdata
.
append
((
ngram_str
,
n_words
))
# returns a dict {term => id} and a count of inserted ones
# returns a dict {term => id} and a count of inserted ones
(
new_ngrams_ids
,
n_added_ng
)
=
bulk_insert_ifnotexists
(
(
new_ngrams_ids
,
n_added_ng
)
=
bulk_insert_ifnotexists
(
...
@@ -464,15 +554,11 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'):
...
@@ -464,15 +554,11 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'):
)
)
del
imported_ngrams_dbdata
del
imported_ngrams_dbdata
# loop on old ngrams and create direct mapping old_id => new_id
# new_ngrams_ids contains a direct mapping ng_str => new_id
old_to_new_id_map
=
{}
del
imported_unique_ngramstrs
for
term
,
oldid
in
imported_ngrams_oldids
.
items
():
old_to_new_id_map
[
oldid
]
=
new_ngrams_ids
[
term
]
del
new_ngrams_ids
del
imported_ngrams_oldids
# print(
old_to_new_id_map
)
# print(
new_ngrams_ids
)
# print(import_nodes_ngrams)
# print(import
ed
_nodes_ngrams)
# ======== Import into lists =========
# ======== Import into lists =========
# 3 x abstract lists + 1 translations
# 3 x abstract lists + 1 translations
...
@@ -483,27 +569,26 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'):
...
@@ -483,27 +569,26 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'):
'groupings'
:
Translations
()
'groupings'
:
Translations
()
}
}
for
list_type
in
import_nodes_ngrams
:
for
list_type
in
import
ed
_nodes_ngrams
:
for
old_id
in
import
_nodes_ngrams
[
list_type
]:
for
ng_str
in
imported
_nodes_ngrams
[
list_type
]:
new_id
=
old_to_new_id_map
[
old_id
]
new_id
=
new_ngrams_ids
[
ng_str
]
# add to the abstract list
# add to the abstract list
result
[
list_type
]
.
items
.
add
(
new_id
)
result
[
list_type
]
.
items
.
add
(
new_id
)
# for main also add map elements
# for main also add map elements
if
list_type
==
'main'
:
if
list_type
==
'main'
:
for
old_id
in
import
_nodes_ngrams
[
'map'
]:
for
ng_str
in
imported
_nodes_ngrams
[
'map'
]:
new_id
=
old_to_new_id_map
[
old_id
]
new_id
=
new_ngrams_ids
[
ng_str
]
result
[
'main'
]
.
items
.
add
(
new_id
)
result
[
'main'
]
.
items
.
add
(
new_id
)
# ======== Synonyms =========
# ======== Synonyms =========
for
(
x
,
y
)
in
imported_groupings
:
for
(
x_str
,
y_str
)
in
imported_groupings
:
if
(
x
not
in
ignored_oldids
)
and
(
y
not
in
ignored_oldids
):
new_mainform_id
=
new_ngrams_ids
[
x_str
]
new_mainform_id
=
old_to_new_id_map
[
x
]
new_subform_id
=
new_ngrams_ids
[
y_str
]
new_subform_id
=
old_to_new_id_map
[
y
]
# /!\ Translations use (subform => mainform) order
# /!\ Translations use (subform => mainform) order
result
[
'groupings'
]
.
items
[
new_subform_id
]
=
new_mainform_id
result
[
'groupings'
]
.
items
[
new_subform_id
]
=
new_mainform_id
n_group_relations
+=
1
n_group_relations
+=
1
# ------------------------------------------------------------------
# ------------------------------------------------------------------
print
(
"IMPORT: read
%
i lines from the CSV"
%
n_read_lines
)
print
(
"IMPORT: read
%
i lines from the CSV"
%
n_read_lines
)
...
@@ -511,6 +596,7 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'):
...
@@ -511,6 +596,7 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'):
%
(
n_total_ng
,
n_added_ng
,
n_total_ng
-
n_added_ng
)
)
%
(
n_total_ng
,
n_added_ng
,
n_total_ng
-
n_added_ng
)
)
print
(
"IMPORT: read
%
i grouping relations"
%
n_group_relations
)
print
(
"IMPORT: read
%
i grouping relations"
%
n_group_relations
)
# print("IMPORT RESULT", result)
return
result
return
result
...
...
gargantext/views/api/ngramlists.py
View file @
75d4a738
...
@@ -66,29 +66,42 @@ class CSVLists(APIView):
...
@@ -66,29 +66,42 @@ class CSVLists(APIView):
/!
\
We assume we checked the file size client-side before upload
/!
\
We assume we checked the file size client-side before upload
£TODO check authentication and user.id
"""
"""
if
not
request
.
user
.
is_authenticated
():
res
=
HttpResponse
(
"Unauthorized"
)
res
.
status_code
=
401
return
res
# this time the corpus param is the one with the target lists to be patched
# this time the corpus param is the one with the target lists to be patched
params
=
get_parameters
(
request
)
params
=
get_parameters
(
request
)
corpus_id
=
int
(
params
.
pop
(
"onto_corpus"
))
corpus_id
=
int
(
params
.
pop
(
"onto_corpus"
))
corpus_node
=
cache
.
Node
[
corpus_id
]
corpus_node
=
cache
.
Node
[
corpus_id
]
if
request
.
user
.
id
!=
corpus_node
.
user_id
:
res
=
HttpResponse
(
"Unauthorized"
)
res
.
status_code
=
401
return
res
# request also contains the file
# request also contains the file
# csv_file has type django.core.files.uploadedfile.InMemoryUploadedFile
# csv_file has type django.core.files.uploadedfile.InMemoryUploadedFile
# ----------------------
# ----------------------
csv_file
=
request
.
data
[
'csvfile'
]
csv_file
=
request
.
data
[
'csvfile'
]
# import the csv
# import the csv
new_lists
=
import_ngramlists
(
csv_file
)
try
:
del
csv_file
new_lists
=
import_ngramlists
(
csv_file
)
del
csv_file
# merge the new_lists onto those of the target corpus
log_msg
=
merge_ngramlists
(
new_lists
,
onto_corpus
=
corpus_node
)
# merge the new_lists onto those of the target corpus
log_msg
=
merge_ngramlists
(
new_lists
,
onto_corpus
=
corpus_node
)
return
JsonHttpResponse
({
return
JsonHttpResponse
({
'log'
:
log_msg
,
'log'
:
log_msg
,
},
200
)
},
200
)
except
Exception
as
e
:
return
JsonHttpResponse
({
'err'
:
str
(
e
),
},
400
)
...
...
static/lib/gargantext/tables.css
View file @
75d4a738
...
@@ -87,7 +87,7 @@ p.note > label {
...
@@ -87,7 +87,7 @@ p.note > label {
opacity
:
0.3
;
opacity
:
0.3
;
}
}
tr
:hover
{
#my-ajax-table
tr
:hover
{
cursor
:
pointer
;
cursor
:
pointer
;
font-weight
:
bold
;
font-weight
:
bold
;
}
}
...
...
templates/pages/corpora/terms.html
View file @
75d4a738
...
@@ -7,6 +7,53 @@
...
@@ -7,6 +7,53 @@
<link
rel=
"stylesheet"
type=
"text/css"
href=
"{% static "
lib
/
gargantext
/
tables
.
css
"%}"
/>
<link
rel=
"stylesheet"
type=
"text/css"
href=
"{% static "
lib
/
gargantext
/
tables
.
css
"%}"
/>
<link
rel=
"stylesheet"
type=
"text/css"
href=
"{% static "
lib
/
gargantext
/
charts
.
css
"%}"
/>
<link
rel=
"stylesheet"
type=
"text/css"
href=
"{% static "
lib
/
gargantext
/
charts
.
css
"%}"
/>
<style>
#formatinfos-announce
{
font-size
:
12px
;
padding-top
:
.5em
;
}
#formatinfos
{
background-color
:
#CCC
;
font-size
:
12px
;
padding
:
1em
;
border-radius
:
1em
;
margin
:
1.5em
;
}
#formatinfos
table
{
margin-left
:
2.5em
;
margin-bottom
:
1em
;
margin-top
:
1em
;
}
#formatinfos
tr
:hover
{
font-weight
:
normal
}
#formatinfos
td
{
color
:
inherit
;
hover
:
none
;
}
#formatinfos
h4
{
font-size
:
14px
;
color
:
#777
;
font-weight
:
bold
;
}
#formatinfos
p
{
font-size
:
14px
;
}
#formatinfos
em
{
font-weight
:
bold
;
}
</style>
<script
type=
"text/javascript"
src=
"{% static "
lib
/
d3
/
d3
.
js
"%}"
></script>
<script
type=
"text/javascript"
src=
"{% static "
lib
/
d3
/
d3
.
js
"%}"
></script>
<script
type=
"text/javascript"
src=
"{% static "
lib
/
d3
/
crossfilter
.
js
"%}"
></script>
<script
type=
"text/javascript"
src=
"{% static "
lib
/
d3
/
crossfilter
.
js
"%}"
></script>
<script
type=
"text/javascript"
src=
"{% static "
lib
/
d3
/
dc
.
js
"%}"
></script>
<script
type=
"text/javascript"
src=
"{% static "
lib
/
d3
/
dc
.
js
"%}"
></script>
...
@@ -126,7 +173,7 @@
...
@@ -126,7 +173,7 @@
<div
class=
"modal-content"
>
<div
class=
"modal-content"
>
<div
class=
"modal-header"
>
<div
class=
"modal-header"
>
<button
type=
"button"
class=
"close"
data-dismiss=
"modal"
aria-hidden=
"true"
>
×
</button>
<button
type=
"button"
class=
"close"
data-dismiss=
"modal"
aria-hidden=
"true"
>
×
</button>
<h3
id=
"myModalLabel"
>
Import a
CSV term
list
</h3>
<h3
id=
"myModalLabel"
>
Import a
Term
list
</h3>
</div>
</div>
<div
class=
"modal-body"
id=
"uploadform"
>
<div
class=
"modal-body"
id=
"uploadform"
>
<form
id=
"csvimportform"
<form
id=
"csvimportform"
...
@@ -134,8 +181,36 @@
...
@@ -134,8 +181,36 @@
enctype=
"multipart/form-data"
enctype=
"multipart/form-data"
method=
"post"
>
method=
"post"
>
{% csrf_token %}
{% csrf_token %}
<label>
From your disk:
</label>
<label>
From
a CSV on
your disk:
</label>
<input
type=
"file"
id=
"csvfile"
accept=
"text/csv"
>
<input
type=
"file"
id=
"csvfile"
accept=
"text/csv"
>
<p
id=
"formatinfos-announce"
>
<span
id=
"formatinfos-icon"
class=
"glyphicon glyphicon-triangle-right"
onclick=
"toggleFormatInfos()"
></span>
More infos about CSV expected format
</p>
<div
id=
"formatinfos"
style=
"display:none;"
>
<h4>
Example table
</h4>
<table
class=
"table-condensed note"
>
<tr><th>
status
</th>
<th>
label
</th>
<th>
forms
</th></tr>
<tr><td>
map
</td>
<td>
barograph
</td>
<td></td></tr>
<tr><td>
map
</td>
<td>
seafaring
</td>
<td>
seafarer|
&
|ocean travel
</td></tr>
<tr><td>
main
</td>
<td>
electromagnetic
</td>
<td>
electro-magnetic
</td></tr>
</table>
<h4>
Remarks
</h4>
<ul>
<li>
Tabulation is the expected delimiter between columns.
</li>
<li>
The only mandatory column is
<em>
label
</em>
.
</li>
<li>
If
<em>
status
</em>
is absent, default target status is "map"
</li>
<li>
When a column is there, always add its header on the 1st line:
<em>
status
</em>
,
<em>
label
</em>
,
<em>
forms
</em></li>
<li>
The label will be always added as a form, even if it's not in 'forms' column
</li>
<li>
The string
<em>
|
&
|
</em>
(3 characters) is the expected delimiter between forms.
</li>
</ul>
</div>
<br/>
<br/>
<label>
From another corpus:
</label>
<label>
From another corpus:
</label>
<p>
TODO
</p>
<p>
TODO
</p>
...
@@ -153,6 +228,28 @@
...
@@ -153,6 +228,28 @@
<script
type=
"text/javascript"
src=
"{% static "
lib
/
gargantext
/
NGrams_dyna_chart_and_table
.
js
"
%}"
></script>
<script
type=
"text/javascript"
src=
"{% static "
lib
/
gargantext
/
NGrams_dyna_chart_and_table
.
js
"
%}"
></script>
<script
type=
"text/javascript"
>
<script
type=
"text/javascript"
>
var
formatInfosOpen
=
false
;
function
toggleFormatInfos
()
{
// when already open => we close
if
(
formatInfosOpen
)
{
// hide div
$
(
'#formatinfos'
).
hide
()
// change icon
$
(
'#formatinfos-icon'
)[
0
].
classList
.
remove
(
'glyphicon-triangle-bottom'
)
$
(
'#formatinfos-icon'
)[
0
].
classList
.
add
(
'glyphicon-triangle-right'
)
// toggle flag
formatInfosOpen
=
false
;
}
else
{
// opposite case
$
(
'#formatinfos'
).
show
()
$
(
'#formatinfos-icon'
)[
0
].
classList
.
remove
(
'glyphicon-triangle-right'
)
$
(
'#formatinfos-icon'
)[
0
].
classList
.
add
(
'glyphicon-triangle-bottom'
)
formatInfosOpen
=
true
;
}
}
/* merci c24b !
/* merci c24b !
* Uses csvimportroute variable from the django template
* Uses csvimportroute variable from the django template
...
@@ -202,7 +299,7 @@ function postCSV(e){
...
@@ -202,7 +299,7 @@ function postCSV(e){
xhr
.
setRequestHeader
(
"X-CSRFToken"
,
getCookie
(
"csrftoken"
));
xhr
.
setRequestHeader
(
"X-CSRFToken"
,
getCookie
(
"csrftoken"
));
},
},
success
:
function
(
response
)
{
success
:
function
(
response
)
{
my_html
=
"<h2 color='green'>IMPORT OK ! </h2>"
my_html
=
'<h3 style="color:green">IMPORT OK</h3>'
my_html
+=
"<p class='note'>"
+
response
[
'log'
].
replace
(
/
\n
/g
,
'<br/>'
)
+
"</p>"
my_html
+=
"<p class='note'>"
+
response
[
'log'
].
replace
(
/
\n
/g
,
'<br/>'
)
+
"</p>"
my_html
+=
"<p'>(this page will reload in 3s)</p>"
my_html
+=
"<p'>(this page will reload in 3s)</p>"
$
(
'#formanswer'
).
html
(
my_html
);
$
(
'#formanswer'
).
html
(
my_html
);
...
@@ -211,7 +308,10 @@ function postCSV(e){
...
@@ -211,7 +308,10 @@ function postCSV(e){
setTimeout
(
"location.reload(true)"
,
3000
);
setTimeout
(
"location.reload(true)"
,
3000
);
},
},
error
:
function
(
result
)
{
error
:
function
(
result
)
{
$
(
'#formanswer'
).
html
(
'Erreur'
);
my_html
=
'<h3 style="color:red">Error</h3>'
my_html
+=
"<p class='note'>please correct your CSV file and retry</p>"
my_html
+=
"<p>"
+
result
.
responseJSON
[
'err'
]
+
"</p>"
$
(
'#formanswer'
).
html
(
my_html
);
console
.
error
(
result
);
console
.
error
(
result
);
},
},
});
});
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment