humanities / gargantext · Commits

Commit e24efe96
Authored Sep 08, 2017 by Alexandre Delanoë

    Merge remote-tracking branch 'origin/simon-unstable-lists-fix' into unstable

Parents: 06f55400, 224eae66

Showing 6 changed files with 170 additions and 195 deletions (+170 -195):

    gargantext/util/group_tools.py                    +17  -14
    gargantext/util/lists.py                           +3   -0
    gargantext/util/ngramlists_tools.py               +42  -31
    gargantext/util/parsers/CSV.py                    +51 -112
    gargantext/util/toolchain/ngrams_extraction.py    +38  -37
    templates/pages/projects/project.html             +19   -1
gargantext/util/group_tools.py

@@ -7,7 +7,7 @@ from gargantext.util.db import session, aliased
 from gargantext.models import Ngram, NodeNgramNgram
 from igraph import Graph # for group_union

-def query_groups(groupings_id, details=False):
+def query_groups(groupings_id, details=False, sort=False):
     """
     Listing of couples (mainform, subform)
       aka (ngram1_id, ngram2_id)
@@ -15,24 +15,27 @@ def query_groups(groupings_id, details=False):
     Parameter:
       - details: if False, just send the array of couples
                  if True, send quadruplets with (ngram1_id, term1, ngram2_id, term2)
+      - sort: order results by terms of ngram1 then ngram2
     """
+    if details or sort:
+        Ngram1, Ngram2 = Ngram, aliased(Ngram)
+
     if not details:
         # simple contents
-        query = session.query(NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id)
+        columns = (NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id)
     else:
         # detailed contents (id + terms)
-        Ngram1 = aliased(Ngram)
-        Ngram2 = aliased(Ngram)
-        query = (session.query(
-                    NodeNgramNgram.ngram1_id,
-                    Ngram1.terms,
-                    NodeNgramNgram.ngram2_id,
-                    Ngram2.terms,
-                 )
-                 .join(Ngram1, NodeNgramNgram.ngram1_id == Ngram1.id)
-                 .join(Ngram2, NodeNgramNgram.ngram2_id == Ngram2.id)
-                )
+        columns = (Ngram1.id, Ngram1.terms,
+                   Ngram2.id, Ngram2.terms)
+
+    query = session.query(*columns)
+
+    if details or sort:
+        query = (query.join(Ngram1, NodeNgramNgram.ngram1_id == Ngram1.id)
+                      .join(Ngram2, NodeNgramNgram.ngram2_id == Ngram2.id))
+
+    if sort:
+        query = query.order_by(Ngram1.terms, Ngram2.terms)

     # main filter
     # -----------
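The refactor above builds a single query object and attaches the joins and ordering only when they are needed. A minimal, self-contained sketch of that pattern, assuming SQLAlchemy 1.x-style declarative models and an in-memory SQLite engine (toy stand-ins, not the real gargantext schema):

    # Sketch of the conditional join/sort pattern from query_groups.
    from sqlalchemy import Column, Integer, String, create_engine
    from sqlalchemy.ext.declarative import declarative_base
    from sqlalchemy.orm import aliased, sessionmaker

    Base = declarative_base()

    class Ngram(Base):
        __tablename__ = 'ngram'
        id = Column(Integer, primary_key=True)
        terms = Column(String)

    class NodeNgramNgram(Base):
        __tablename__ = 'nodengramngram'
        id = Column(Integer, primary_key=True)
        node_id = Column(Integer)
        ngram1_id = Column(Integer)
        ngram2_id = Column(Integer)

    engine = create_engine('sqlite://')
    Base.metadata.create_all(engine)
    session = sessionmaker(bind=engine)()

    def query_groups(details=False, sort=False):
        if details or sort:
            Ngram1, Ngram2 = Ngram, aliased(Ngram)

        if not details:
            # ids only
            columns = (NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id)
        else:
            # ids + terms
            columns = (Ngram1.id, Ngram1.terms, Ngram2.id, Ngram2.terms)

        query = session.query(*columns)

        if details or sort:
            # joins are only required when terms are selected or sorted on
            query = (query.join(Ngram1, NodeNgramNgram.ngram1_id == Ngram1.id)
                          .join(Ngram2, NodeNgramNgram.ngram2_id == Ngram2.id))

        if sort:
            query = query.order_by(Ngram1.terms, Ngram2.terms)

        return query

    session.add_all([Ngram(id=1, terms='b-term'), Ngram(id=2, terms='a-form'),
                     NodeNgramNgram(node_id=1, ngram1_id=1, ngram2_id=2)])
    session.commit()
    print(query_groups(details=True, sort=True).all())
    # => [(1, 'b-term', 2, 'a-form')]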
gargantext/util/lists.py

@@ -50,6 +50,9 @@ class _BaseClass:
         else:
             return NotImplemented

+    def __len__(self):
+        return len(self.items)
+
     def __repr__(self):
         items = self.items
         if isinstance(items, defaultdict):
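The added __len__ simply lets the built-in len() work on these list wrappers. A toy stand-in (hypothetical class mirroring only the relevant bit of _BaseClass, whose subclasses keep their contents in self.items):

    class ToyList:
        def __init__(self, items):
            self.items = set(items)

        def __len__(self):
            # delegate to the underlying container, as in the patch
            return len(self.items)

    print(len(ToyList([1, 2, 3])))  # => 3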
gargantext/util/ngramlists_tools.py

@@ -8,8 +8,7 @@ Tools to work with ngramlists (MAINLIST, MAPLIST, STOPLIST)
 """
 from gargantext.util.group_tools import query_groups, group_union
-from gargantext.util.db          import session, desc, func, \
-                                        bulk_insert_ifnotexists
+from gargantext.util.db          import session, bulk_insert_ifnotexists
 from gargantext.models           import Ngram, NodeNgram, NodeNodeNgram, \
                                         NodeNgramNgram, Node
@@ -25,7 +24,6 @@ from gargantext.util.toolchain.ngrams_extraction import normalize_forms
 # merge will also index the new ngrams in the docs of the corpus
 from gargantext.util.toolchain.ngrams_addition import index_new_ngrams
-from sqlalchemy.sql import exists
 from os          import path
 from csv         import writer, reader, QUOTE_MINIMAL
 from collections import defaultdict
@@ -35,8 +33,8 @@ from celery import shared_task
 def query_list(list_id,
                pagination_limit=None, pagination_offset=None,
-               details=False, scoring_metric_id=None, groupings_id=None
-               ):
+               details=False, scoring_metric_id=None, groupings_id=None,
+               sort=False):
     """
     Paginated listing of ngram_ids in a NodeNgram lists.
@@ -51,6 +49,7 @@ def query_list(list_id,
                  (for details and sorting)
       - groupings_id: optional id of a list of grouping relations (synonyms)
                       (each synonym will be added to the list if not already in there)
+      - sort: order by Ngram.terms (not possible if details is False)

     FIXME: subforms appended recently and not generalized enough
            => add a common part for all "if groupings_id"
@@ -114,7 +113,10 @@ def query_list(list_id,
         query = query.limit(pagination_limit)

     if pagination_offset:
-        query = query.offset(pagination_offsets)
+        query = query.offset(pagination_offset)
+
+    if details and sort:
+        query = query.order_by(Ngram.terms)

     return query
@@ -175,9 +177,7 @@ def ngrams_to_csv_rows(ngram_objs, ngram_dico={}, group_infos={},
         # 3 columns = |status,      | mainform,   | forms
         #   (type_of_list)  ( term )   ( subterm1|&|subterm2 )
-        csv_rows.append(
-            [list_type, ng_obj.terms, this_grouped_terms]
-        )
+        csv_rows.append([list_type, ng_obj.terms, this_grouped_terms])

     return csv_rows
@@ -220,9 +220,10 @@ def export_ngramlists(node,fname=None,delimiter=DEFAULT_CSV_DELIM,titles=True):
     # corresponding lists of ngram_ids
     # ------------------------------------
     # content: list of ngram objects [(2562,"monterme",1),...]
-    stop_ngrams = query_list(stoplist_node.id, details=True, groupings_id=group_node.id).all()
-    main_ngrams = query_list(mainlist_node.id, details=True, groupings_id=group_node.id).all()
-    map_ngrams  = query_list(maplist_node.id,  details=True, groupings_id=group_node.id).all()
+    stop_ngrams, main_ngrams, map_ngrams = (
+        query_list(n.id, details=True, groupings_id=group_node.id, sort=True).all()
+        for n in (stoplist_node, mainlist_node, maplist_node)
+    )

     # for debug ---------->8 --------------------
     #~ stop_ngrams = stop_ngrams[0:10]
@@ -239,7 +240,7 @@ def export_ngramlists(node,fname=None,delimiter=DEFAULT_CSV_DELIM,titles=True):
     # for the groups we got couples of ids in the DB
     # -------------------
     # ex: [(3544, 2353), (2787, 4032), ...]
-    group_ngram_id_couples = query_groups(group_node.id).all()
+    group_ngram_id_couples = query_groups(group_node.id, sort=True)

     # we expand this to a double structure for groups lookup
     # 1) g['links'] = k couples (x,y_i) as a set [x => {y1,y2}]
@@ -386,6 +387,9 @@ def import_ngramlists(the_file, delimiter=DEFAULT_CSV_DELIM,
     NB: To merge the imported lists into a corpus node's lists,
         chain this function with merge_ngramlists()
     '''
+    list_types = ['stop','main','map']
+
     # ---------------
     # ngram storage
     # ---------------
@@ -450,7 +454,6 @@ def import_ngramlists(the_file, delimiter=DEFAULT_CSV_DELIM,
         # headers
         if i == 0:
-            n_cols = len(csv_row)
             for j, colname in enumerate(csv_row):
                 if colname in ['label', 'status', 'forms']:
                     columns[colname] = j
@@ -497,31 +500,30 @@ def import_ngramlists(the_file, delimiter=DEFAULT_CSV_DELIM,
             continue

         # --- check correct list type
-        if not this_list_type in ['stop','main','map']:
+        if not this_list_type in list_types:
             print("IMPORT WARN: (skip line) wrong list type at CSV %s:l.%i" % (fname, i))
             continue

         # subforms can be duplicated (in forms and another label)
         # but we must take care of unwanted other duplicates too
-        if this_row_label in imported_unique_ngramstrs:
-            print("TODO IMPORT DUPL: (skip line) term appears more than once at CSV %s:l.%i"
-                    % (fname, i))
+        if imported_unique_ngramstrs.get(this_row_label) == 1:
+            print("TODO IMPORT DUPL: (skip line) term %r appears more than once at CSV %s:l.%i"
+                    % (this_row_label, fname, i))

         # ================= Store the data ====================
         # the ngram census
-        imported_unique_ngramstrs[this_row_label] = True
+        imported_unique_ngramstrs[this_row_label] = 1

         # and the "list to ngram" relation
         imported_nodes_ngrams[this_list_type].append(this_row_label)

         # ====== Store synonyms from the import (if any) ======
         if len(this_row_forms) != 0:
-            other_terms = []
             for raw_term_str in this_row_forms.split(group_delimiter):

                 # each subform is also like an ngram declaration
                 term_str = normalize_forms(normalize_chars(raw_term_str))
-                imported_unique_ngramstrs[term_str] = True
+                imported_unique_ngramstrs[term_str] = 2
                 imported_nodes_ngrams[this_list_type].append(term_str)

                 # the optional repeated mainform doesn't interest us
@@ -599,7 +601,10 @@ def import_ngramlists(the_file, delimiter=DEFAULT_CSV_DELIM,
            % (n_total_ng, n_added_ng, n_total_ng-n_added_ng)
           )
     print("IMPORT: read %i grouping relations" % n_group_relations)

-    # print("IMPORT RESULT", result)
+    list_counts = [(typ, len(result.get(typ))) for typ in list_types]
+    list_counts.append(('total', sum(x[1] for x in list_counts)))
+    print("IMPORT: " + '; '.join('%s %s' % stats for stats in list_counts))

     return result

 def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
@@ -707,9 +712,11 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
     # ======== Merging all involved ngrams =========
-    # all memberships with resolved conflicts of interfering memberships
+    # all ngram memberships with resolved conflicts of interfering memberships
+    # (associates ngram ids with list types -- see linfos definition above)
     resolved_memberships = {}

+    # iterates over each ngram of each list type for both old and new lists
     for list_set in [old_lists, new_lists]:
         for lid, info in enumerate(linfos):
             list_type = info['key']
@@ -739,11 +746,11 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
     # ======== Merging old and new groups =========
     # get the arcs already in the target DB (directed couples)
     previous_links = session.query(
                        NodeNgramNgram.ngram1_id,
                        NodeNgramNgram.ngram2_id
                      ).filter(
                        NodeNgramNgram.node_id == old_group_id
                      ).all()
     [indentation-only change in this hunk]

     n_links_previous = len(previous_links)
@@ -811,7 +818,7 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
             list_type = linfos[lid]['key']
             merged_results[list_type].items.add(ng_id)

-    # print("IMPORT: added %i elements in the lists indices" % added_nd_ng)
+    print("IMPORT: added %i elements in the lists indices" % added_nd_ng)

     # ======== Overwrite old data with new =========
     for lid, info in enumerate(linfos):
@@ -834,10 +841,14 @@ def import_and_merge_ngramlists(file_contents, onto_corpus_id, overwrite=False):
     """
     A single function to run import_ngramlists and merge_ngramlists together
     """
-    print("import list")
+    print("IMPORT CSV termlists file with %s lines in corpus %s (%s)"
+            % (len(file_contents), onto_corpus_id,
+               'overwrite' if overwrite else 'merge'))

     new_lists = import_ngramlists(file_contents)

-    corpus_node = session.query(Node).filter(Node.id == onto_corpus_id).first()
+    corpus_node = session.query(Node).get(onto_corpus_id)

     # merge the new_lists onto those of the target corpus
     del_originals = ['stop','main','map'] if overwrite else []
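Two changes in this file pair up: query_list gains a sort flag, and export_ngramlists collapses three near-identical calls into a single generator expression unpacked into three names. A toy illustration of that unpacking pattern, with query_list stubbed out (hypothetical values, not gargantext data):

    # Stub standing in for gargantext's query_list.
    def query_list(node_id, sort=False):
        return ['%s-term-%i' % (node_id, k) for k in range(2)]

    stoplist_node, mainlist_node, maplist_node = 'stop', 'main', 'map'

    # one result per node, bound in order by tuple unpacking
    stop_ngrams, main_ngrams, map_ngrams = (
        query_list(n, sort=True)
        for n in (stoplist_node, mainlist_node, maplist_node)
    )

    print(stop_ngrams)  # => ['stop-term-0', 'stop-term-1']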
gargantext/util/parsers/CSV.py

@@ -4,128 +4,67 @@ import sys
 import csv
 csv.field_size_limit(sys.maxsize)
 import numpy as np
-import os

 class CSVParser(Parser):
+    DELIMITERS = ", \t;|:"

-    def CSVsample(self, small_contents, delim):
-        reader = csv.reader(small_contents, delimiter=delim)
-        Freqs = []
-        for row in reader:
-            Freqs.append(len(row))
-        return Freqs
+    def detect_delimiter(self, lines, sample_size=10):
+        sample = lines[:sample_size]
+
+        # Compute frequency of each delimiter on each input line
+        delimiters_freqs = {
+            d: [line.count(d) for line in sample]
+            for d in self.DELIMITERS
+        }
+
+        # Select delimiters with a standard deviation of zero, ie. delimiters
+        # for which we have the same number of fields on each line
+        selected_delimiters = [
+            (d, np.sum(freqs))
+            for d, freqs in delimiters_freqs.items()
+            if any(freqs) and np.std(freqs) == 0
+        ]
+
+        if selected_delimiters:
+            # Choose the delimiter with highest frequency amongst selected ones
+            sorted_delimiters = sorted(selected_delimiters, key=lambda x: x[1])
+            return sorted_delimiters[-1][0]

     def parse(self, filebuf):
         print("CSV: parsing (assuming UTF-8 and LF line endings)")
         contents = filebuf.read().decode("UTF-8").split("\n")
-        sample_size = 10
-        sample_contents = contents[0:sample_size]
-        hyperdata_list = []
-        # # = = = = [ Getting delimiters frequency ] = = = = #
-        PossibleDelimiters = [',',' ','\t',';','|',':']
-        AllDelimiters = {}
-        for delim in PossibleDelimiters:
-            AllDelimiters[delim] = self.CSVsample(sample_contents, delim)
-        # # = = = = [ / Getting delimiters frequency ] = = = = #
-        # # OUTPUT example:
-        # # AllDelimiters = {
-        # #   '\t': [1, 1, 1, 1, 1],
-        # #   ' ': [1, 13, 261, 348, 330],
-        # #   ',': [15, 15, 15, 15, 15],
-        # #   ';': [1, 1, 1, 1, 1],
-        # #   '|': [1, 1, 1, 1, 1]
-        # # }
-        # # = = = = [ Stand.Dev=0 & Sum of delimiters ] = = = = #
-        Delimiters = []
-        for d in AllDelimiters:
-            freqs = AllDelimiters[d]
-            suma = np.sum(freqs)
-            if suma > 0:
-                std = np.std(freqs)
-                # print [ d , suma , len(freqs) , std]
-                if std == 0:
-                    Delimiters.append([d, suma, len(freqs), std])
-        # # = = = = [ / Stand.Dev=0 & Sum of delimiters ] = = = = #
-        # # OUTPUT example:
-        # # Delimiters = [
-        # #   ['\t', 5, 5, 0.0],
-        # #   [',', 75, 5, 0.0],
-        # #   ['|', 5, 5, 0.0]
-        # # ]
-        # # = = = = [ Delimiter selection ] = = = = #
-        Sorted_Delims = sorted(Delimiters, key=lambda x: x[1], reverse=True)
-        HighestDelim = Sorted_Delims[0][0]
-        # HighestDelim = ","
-        print("CSV selected delimiter:", [HighestDelim])
-        # # = = = = [ / Delimiter selection ] = = = = #
-        # # = = = = [ First data coordinate ] = = = = #
-        Coords = {"row": -1, "column": -1}
-        reader = csv.reader(contents, delimiter=HighestDelim)
-        for rownum, tokens in enumerate(reader):
-            if rownum % 250 == 0:
-                print("CSV row: ", rownum)
-            joined_tokens = "".join(tokens)
-            if Coords["row"] < 0 and len(joined_tokens) > 0:
-                Coords["row"] = rownum
-                for columnum in range(len(tokens)):
-                    t = tokens[columnum]
-                    if len(t) > 0:
-                        Coords["column"] = columnum
-                        break
-        # # = = = = [ / First data coordinate ] = = = = #
-        # # = = = = [ Setting Headers ] = = = = #
-        Headers_Int2Str = {}
-        reader = csv.reader(contents, delimiter=HighestDelim)
-        for rownum, tokens in enumerate(reader):
-            if rownum >= Coords["row"]:
-                for columnum in range(Coords["column"], len(tokens)):
-                    t = tokens[columnum]
-                    Headers_Int2Str[columnum] = t
-                break
-        # print("Headers_Int2Str")
-        # print(Headers_Int2Str)
-        # # = = = = [ / Setting Headers ] = = = = #
-        # # OUTPUT example:
-        # # Headers_Int2Str = {
-        # #   0: 'publication_date',
-        # #   1: 'publication_month',
-        # #   2: 'publication_second',
-        # #   3: 'abstract'
-        # # }
-        # # = = = = [ Reading the whole CSV and saving ] = = = = #
-        hyperdata_list = []
-        reader = csv.reader(contents, delimiter=HighestDelim)
-        for rownum, tokens in enumerate(reader):
-            if rownum > Coords["row"]:
-                RecordDict = {}
-                for columnum in range(Coords["column"], len(tokens)):
-                    data = tokens[columnum]
-                    RecordDict[Headers_Int2Str[columnum]] = data
-                if len(RecordDict.keys()) > 0:
-                    hyperdata_list.append(RecordDict)
-        # # = = = = [ / Reading the whole CSV and saving ] = = = = #
-        return hyperdata_list
+
+        # Filter out empty lines
+        contents = [line for line in contents if line.strip()]
+
+        # Delimiter auto-detection
+        delimiter = self.detect_delimiter(contents, sample_size=10)
+
+        if delimiter is None:
+            raise ValueError("CSV: couldn't detect delimiter, bug or malformed data")
+
+        print("CSV: selected delimiter: %r" % delimiter)
+
+        # Parse CSV
+        reader = csv.reader(contents, delimiter=delimiter)
+
+        # Get first not empty row and its fields (ie. header row), or (0, [])
+        first_row, headers = \
+            next(((i, fields) for i, fields in enumerate(reader) if any(fields)),
+                 (0, []))
+
+        # Get first not empty column of the first row, or 0
+        first_col = next((i for i, field in enumerate(headers) if field), 0)
+
+        # Strip out potential empty fields in headers
+        headers = headers[first_col:]
+
+        # Return a generator of dictionaries with column labels as keys,
+        # filtering out empty rows
+        for i, fields in enumerate(reader):
+            if i % 500 == 0:
+                print("CSV: parsing row #%s..." % (i+1))
+            if any(fields):
+                yield dict(zip(headers, fields[first_col:]))
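The new detection heuristic reads well on its own: a delimiter is plausible when it occurs the same nonzero number of times on every sampled line, and among the plausible candidates the most frequent one wins. A standalone sketch using only the stdlib (statistics.pstdev standing in for np.std, which should be equivalent for this purpose):

    from statistics import pstdev

    DELIMITERS = ", \t;|:"

    def detect_delimiter(lines, sample_size=10):
        sample = lines[:sample_size]
        # candidate delimiter -> occurrence count on each sampled line
        freqs = {d: [line.count(d) for line in sample] for d in DELIMITERS}
        # keep candidates that appear at all and equally often on every line
        stable = [(d, sum(counts)) for d, counts in freqs.items()
                  if any(counts) and pstdev(counts) == 0]
        if stable:
            # prefer the most frequent stable candidate
            return max(stable, key=lambda x: x[1])[0]

    print(detect_delimiter(['a;b;c', '1;2;3', '4;5;6']))  # => ';'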
gargantext/util/toolchain/ngrams_extraction.py

@@ -81,44 +81,45 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
             corpus.hyperdata["skipped_docs"].append(document.id)
             corpus.save_hyperdata()
             continue
-        else:
-            # ready !
-            tagger = tagger_bots[language_iso2]
-            # to do verify if document has no KEYS to index
-            # eg: use set intersect (+ loop becomes direct! with no continue)
-            for key in keys:
-                try:
-                    value = document.hyperdata[str(key)]
-                    if not isinstance(value, str):
-                        #print("DBG wrong content in doc for key", key)
-                        continue
-                    # get ngrams
-                    for ngram in tagger.extract(value):
-                        tokens = tuple(normalize_forms(token[0]) for token in ngram)
-                        if do_subngrams:
-                            # ex tokens = ["very", "cool", "exemple"]
-                            #    subterms = [['very', 'cool'],...]
-                            subterms = subsequences(tokens)
-                        else:
-                            subterms = [tokens]
-
-                        for seqterm in subterms:
-                            ngram = ' '.join(seqterm)
-                            nbwords = len(seqterm)
-                            nbchars = len(ngram)
-                            if nbchars > 1:
-                                if nbchars > 255:
-                                    # max ngram length (DB constraint)
-                                    ngram = ngram[:255]
-                                # doc <=> ngram index
-                                nodes_ngrams_count[(document.id, ngram)] += 1
-                                # add fields : terms n
-                                ngrams_data.add((ngram, nbwords, ))
-                except:
-                    #value not in doc
-                    continue
+
+        # ready !
+        tagger = tagger_bots[language_iso2]
+        # to do verify if document has no KEYS to index
+        # eg: use set intersect (+ loop becomes direct! with no continue)
+        for key in keys:
+            try:
+                value = document.hyperdata[str(key)]
+                if not isinstance(value, str):
+                    #print("DBG wrong content in doc for key", key)
+                    continue
+                # get ngrams
+                for ngram in tagger.extract(value):
+                    normal_forms = (normalize_forms(t[0]) for t in ngram)
+                    tokens = tuple(nf for nf in normal_forms if nf)
+
+                    if do_subngrams:
+                        # ex tokens = ["very", "cool", "exemple"]
+                        #    subterms = [['very', 'cool'],...]
+                        subterms = subsequences(tokens)
+                    else:
+                        subterms = [tokens]
+
+                    for seqterm in subterms:
+                        ngram = ' '.join(seqterm)
+                        nbwords = len(seqterm)
+                        nbchars = len(ngram)
+                        if nbchars > 1:
+                            if nbchars > 255:
+                                # max ngram length (DB constraint)
+                                ngram = ngram[:255]
+                            # doc <=> ngram index
+                            nodes_ngrams_count[(document.id, ngram)] += 1
+                            # add fields : terms n
+                            ngrams_data.add((ngram, nbwords, ))
+            except:
+                #value not in doc
+                continue

     # integrate ngrams and nodes-ngrams
     if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE:
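Beyond the re-indentation, the behavioral change in this hunk is that empty normalized forms are now filtered out before tokens are joined, so blank tokens can no longer produce ngrams with stray spaces. A small illustration with a stubbed normalize_forms (hypothetical; the real one lives in this module and does more):

    # Hypothetical stub for normalize_forms.
    def normalize_forms(form):
        return form.strip().lower()

    # (token, tag) pairs as a tagger might emit them
    tagged_ngram = [('Cool ', 'ADJ'), ('  ', 'SPACE'), ('Example', 'NOUN')]

    normal_forms = (normalize_forms(t[0]) for t in tagged_ngram)
    tokens = tuple(nf for nf in normal_forms if nf)  # drops the empty form

    print(' '.join(tokens))  # => "cool example"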
templates/pages/projects/project.html

@@ -440,11 +440,12 @@
         // in the form "Add a corpus"
         var type = $("#id_type").val()
+        var file = $("#id_file").val()

         // 5 booleans
         var nameField = $("#id_name").val() != ""
         var typeField = (type != "") && (type != "0")
-        var fileField = $("#id_file").val() != ""
+        var fileField = file != ""
         var wantfileField = $("#file_yes").prop("checked")
         var crawling = ((type==3)||(type==8)||(type==9)) && ! wantfileField
@@ -457,6 +458,23 @@
         if (!crawling) {
             $("#submit_thing").prop('disabled' , !(nameField && typeField && fileField))
         }
+
+        // Automatically select CSV when type is undefined
+        // and we have a .csv file
+        if (!typeField && file && file.match(/.csv$/i)) {
+            // Get CSV type id
+            var csv = $('#id_type > option').filter(function () {
+                return $(this).text() === 'CSV'
+            }).attr('value')
+
+            // Select CSV type
+            $('#id_type').val(csv)
+
+            // Focus on name field
+            setTimeout(function () {
+                $("#id_name").focus()
+            })
+        }
     }

 function bringDaNoise() {