Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
87f75264
Commit
87f75264
authored
May 14, 2016
by
delanoe
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'romain-refactoring' into merge
parents
4362b85b
4c3aa4b9
Changes
6
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
301 additions
and
169 deletions
+301
-169
constants.py
gargantext/constants.py
+2
-2
__init__.py
gargantext/util/toolchain/__init__.py
+8
-6
list_main.py
gargantext/util/toolchain/list_main.py
+10
-10
metric_specificity.py
gargantext/util/toolchain/metric_specificity.py
+11
-8
metric_tfidf.py
gargantext/util/toolchain/metric_tfidf.py
+65
-27
NGrams_dyna_chart_and_table.js
static/lib/gargantext/NGrams_dyna_chart_and_table.js
+205
-116
No files found.
gargantext/constants.py
View file @
87f75264
...
@@ -194,9 +194,9 @@ RESOURCETYPES = [
...
@@ -194,9 +194,9 @@ RESOURCETYPES = [
]
]
# linguistic extraction parameters ---------------------------------------------
# linguistic extraction parameters ---------------------------------------------
DEFAULT_
TFIDF_CUTOFF_RATIO
=
.75
# MAINLIST maximum terms in %
DEFAULT_
RANK_CUTOFF_RATIO
=
.75
# MAINLIST maximum terms in %
DEFAULT_
TFIDF_HARD_LIMIT
=
5000
# MAINLIST maximum terms abs
DEFAULT_
RANK_HARD_LIMIT
=
5000
# MAINLIST maximum terms abs
# (makes COOCS larger ~ O(N²) /!\)
# (makes COOCS larger ~ O(N²) /!\)
DEFAULT_COOC_THRESHOLD
=
2
# inclusive minimum for COOCS coefs
DEFAULT_COOC_THRESHOLD
=
2
# inclusive minimum for COOCS coefs
...
...
gargantext/util/toolchain/__init__.py
View file @
87f75264
...
@@ -6,7 +6,7 @@ from .hyperdata_indexing import index_hyperdata
...
@@ -6,7 +6,7 @@ from .hyperdata_indexing import index_hyperdata
# in usual run order
# in usual run order
from
.list_stop
import
do_stoplist
from
.list_stop
import
do_stoplist
from
.metric_tfidf
import
compute_occs
,
compute_tfidf_local
,
compute_
cumulated_tfidf
from
.metric_tfidf
import
compute_occs
,
compute_tfidf_local
,
compute_
ti_ranking
from
.list_main
import
do_mainlist
from
.list_main
import
do_mainlist
from
.ngram_coocs
import
compute_coocs
from
.ngram_coocs
import
compute_coocs
from
.metric_specificity
import
compute_specificity
from
.metric_specificity
import
compute_specificity
...
@@ -116,13 +116,15 @@ def parse_extract_indexhyperdata(corpus):
...
@@ -116,13 +116,15 @@ def parse_extract_indexhyperdata(corpus):
ltfidf_id
=
compute_tfidf_local
(
corpus
)
ltfidf_id
=
compute_tfidf_local
(
corpus
)
print
(
'CORPUS #
%
d: [
%
s] new localtfidf node #
%
i'
%
(
corpus
.
id
,
t
(),
ltfidf_id
))
print
(
'CORPUS #
%
d: [
%
s] new localtfidf node #
%
i'
%
(
corpus
.
id
,
t
(),
ltfidf_id
))
# -> write global and cumulated tfidf to Node and NodeNodeNgram
# -> write global and cumulated ti_ranking (tfidf ranking vector) to Node and NodeNodeNgram
gtfidf_id
=
compute_cumulated_tfidf
(
corpus
,
scope
=
"global"
)
tirank_id
=
compute_ti_ranking
(
corpus
,
print
(
'CORPUS #
%
d: [
%
s] new globaltfidf node #
%
i'
%
(
corpus
.
id
,
t
(),
gtfidf_id
))
count_scope
=
"global"
,
termset_scope
=
"local"
)
print
(
'CORPUS #
%
d: [
%
s] new tfidf ranking node #
%
i'
%
(
corpus
.
id
,
t
(),
tirank_id
))
# -> mainlist: filter + write (to Node and NodeNgram)
# -> mainlist: filter + write (to Node and NodeNgram)
mainlist_id
=
do_mainlist
(
corpus
,
mainlist_id
=
do_mainlist
(
corpus
,
tfidf_id
=
gtfidf
_id
,
ranking_scores_id
=
tirank
_id
,
stoplist_id
=
stop_id
)
stoplist_id
=
stop_id
)
print
(
'CORPUS #
%
d: [
%
s] new mainlist node #
%
i'
%
(
corpus
.
id
,
t
(),
mainlist_id
))
print
(
'CORPUS #
%
d: [
%
s] new mainlist node #
%
i'
%
(
corpus
.
id
,
t
(),
mainlist_id
))
...
@@ -143,7 +145,7 @@ def parse_extract_indexhyperdata(corpus):
...
@@ -143,7 +145,7 @@ def parse_extract_indexhyperdata(corpus):
print
(
'CORPUS #
%
d: [
%
s] new maplist node #
%
i'
%
(
corpus
.
id
,
t
(),
map_id
))
print
(
'CORPUS #
%
d: [
%
s] new maplist node #
%
i'
%
(
corpus
.
id
,
t
(),
map_id
))
print
(
'CORPUS #
%
d: [
%
s] FINISHED ngram lists computation'
%
(
corpus
.
id
,
t
()))
print
(
'CORPUS #
%
d: [
%
s] FINISHED ngram lists computation'
%
(
corpus
.
id
,
t
()))
corpus
.
status
(
'Lists'
,
progress
=
0
,
complete
=
True
)
corpus
.
status
(
'Lists'
,
progress
=
0
,
complete
=
True
)
corpus
.
save_hyperdata
()
corpus
.
save_hyperdata
()
session
.
commit
()
session
.
commit
()
...
...
gargantext/util/toolchain/list_main.py
View file @
87f75264
...
@@ -2,14 +2,14 @@ from gargantext.models import Node, NodeNgram, NodeNodeNgram
...
@@ -2,14 +2,14 @@ from gargantext.models import Node, NodeNgram, NodeNodeNgram
from
gargantext.util.db
import
session
from
gargantext.util.db
import
session
from
gargantext.util.lists
import
UnweightedList
from
gargantext.util.lists
import
UnweightedList
from
sqlalchemy
import
desc
from
sqlalchemy
import
desc
from
gargantext.constants
import
DEFAULT_
TFIDF
_CUTOFF_RATIO
,
\
from
gargantext.constants
import
DEFAULT_
RANK
_CUTOFF_RATIO
,
\
DEFAULT_
TFIDF
_HARD_LIMIT
DEFAULT_
RANK
_HARD_LIMIT
def
do_mainlist
(
corpus
,
def
do_mainlist
(
corpus
,
overwrite_id
=
None
,
overwrite_id
=
None
,
tfidf
_id
=
None
,
stoplist_id
=
None
,
ranking_scores
_id
=
None
,
stoplist_id
=
None
,
hard_limit
=
DEFAULT_
TFIDF
_HARD_LIMIT
,
hard_limit
=
DEFAULT_
RANK
_HARD_LIMIT
,
ratio_limit
=
DEFAULT_
TFIDF
_CUTOFF_RATIO
ratio_limit
=
DEFAULT_
RANK
_CUTOFF_RATIO
):
):
"""
"""
Select top n terms according to a global tfidf ranking and stoplist filter.
Select top n terms according to a global tfidf ranking and stoplist filter.
...
@@ -18,7 +18,7 @@ def do_mainlist(corpus,
...
@@ -18,7 +18,7 @@ def do_mainlist(corpus,
min(hard_limit, number_of_terms * ratio_limit)
min(hard_limit, number_of_terms * ratio_limit)
NB : We use a global tfidf node where the values are global but the ngrams
NB : We use a global tfidf node where the values are global but the ngrams
are already selected (
== only within this corpus document
s).
are already selected (
termset_scope == only within this corpus doc
s).
TO DISCUSS: allow influence of the local tfidf scores too
TO DISCUSS: allow influence of the local tfidf scores too
Parameters:
Parameters:
...
@@ -37,12 +37,12 @@ def do_mainlist(corpus,
...
@@ -37,12 +37,12 @@ def do_mainlist(corpus,
"""
"""
# retrieve helper nodes if not provided
# retrieve helper nodes if not provided
if
not
tfidf
_id
:
if
not
ranking_scores
_id
:
tfidf
_id
=
session
.
query
(
Node
.
id
)
.
filter
(
ranking_scores
_id
=
session
.
query
(
Node
.
id
)
.
filter
(
Node
.
typename
==
"TFIDF-GLOBAL"
,
Node
.
typename
==
"TFIDF-GLOBAL"
,
Node
.
parent_id
==
corpus
.
id
Node
.
parent_id
==
corpus
.
id
)
.
first
()
)
.
first
()
if
not
tfidf
_id
:
if
not
ranking_scores
_id
:
raise
ValueError
(
"MAINLIST: TFIDF node needed for mainlist creation"
)
raise
ValueError
(
"MAINLIST: TFIDF node needed for mainlist creation"
)
if
not
stoplist_id
:
if
not
stoplist_id
:
...
@@ -64,7 +64,7 @@ def do_mainlist(corpus,
...
@@ -64,7 +64,7 @@ def do_mainlist(corpus,
# tfidf-ranked query
# tfidf-ranked query
ordered_filtered_tfidf
=
(
session
ordered_filtered_tfidf
=
(
session
.
query
(
NodeNodeNgram
.
ngram_id
)
.
query
(
NodeNodeNgram
.
ngram_id
)
.
filter
(
NodeNodeNgram
.
node1_id
==
tfidf
_id
)
.
filter
(
NodeNodeNgram
.
node1_id
==
ranking_scores
_id
)
.
filter
(
~
NodeNodeNgram
.
ngram_id
.
in_
(
stopterms_subquery
))
.
filter
(
~
NodeNodeNgram
.
ngram_id
.
in_
(
stopterms_subquery
))
.
order_by
(
desc
(
NodeNodeNgram
.
score
))
.
order_by
(
desc
(
NodeNodeNgram
.
score
))
)
)
...
...
gargantext/util/toolchain/metric_specificity.py
View file @
87f75264
...
@@ -44,11 +44,11 @@ def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
...
@@ -44,11 +44,11 @@ def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
# v = d.sum(axis=1) (- lui-même)
# v = d.sum(axis=1) (- lui-même)
xs
=
x
.
sum
(
axis
=
1
)
-
x
xs
=
x
.
sum
(
axis
=
1
)
-
x
ys
=
x
.
sum
(
axis
=
0
)
-
x
ys
=
x
.
sum
(
axis
=
0
)
-
x
# top inclus ou exclus
# top inclus ou exclus
#n = ( xs + ys) / (2 * (x.shape[0] - 1))
#n = ( xs + ys) / (2 * (x.shape[0] - 1))
# top generic or specific (asc is spec, desc is generic)
# top generic or specific (asc is spec, desc is generic)
v
=
(
xs
-
ys
)
/
(
2
*
(
x
.
shape
[
0
]
-
1
))
v
=
(
xs
-
ys
)
/
(
2
*
(
x
.
shape
[
0
]
-
1
))
...
@@ -105,11 +105,14 @@ def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
...
@@ -105,11 +105,14 @@ def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
# print(v)
# print(v)
pd
.
options
.
display
.
float_format
=
'${:,.2f}'
.
format
pd
.
options
.
display
.
float_format
=
'${:,.2f}'
.
format
data
=
WeightedList
(
if
not
v
.
empty
:
zip
(
v
.
index
.
tolist
()
data
=
WeightedList
(
,
v
.
values
.
tolist
()[
0
]
zip
(
v
.
index
.
tolist
()
)
,
v
.
values
.
tolist
()[
0
]
)
)
data
.
save
(
the_id
)
)
data
.
save
(
the_id
)
else
:
print
(
"WARNING: had no terms in COOCS => empty SPECIFICITY node"
)
return
(
the_id
)
return
(
the_id
)
gargantext/util/toolchain/metric_tfidf.py
View file @
87f75264
...
@@ -88,7 +88,7 @@ def compute_occs(corpus, overwrite_id = None):
...
@@ -88,7 +88,7 @@ def compute_occs(corpus, overwrite_id = None):
return
the_id
return
the_id
def
compute_
cumulated_tfidf
(
corpus
,
scope
=
"local"
,
overwrite_id
=
None
):
def
compute_
ti_ranking
(
corpus
,
count_scope
=
"local"
,
termset_
scope
=
"local"
,
overwrite_id
=
None
):
"""
"""
# TODO check if cumulated tfs correspond to app's use cases and intention
# TODO check if cumulated tfs correspond to app's use cases and intention
...
@@ -96,55 +96,93 @@ def compute_cumulated_tfidf(corpus, scope="local", overwrite_id=None):
...
@@ -96,55 +96,93 @@ def compute_cumulated_tfidf(corpus, scope="local", overwrite_id=None):
Parameters:
Parameters:
- the corpus itself
- the corpus itself
- scope: {"local" or "global"}
- count_scope: {"local" or "global"}
- local <=> frequencies counted in the current corpus
- global <=> frequencies counted in all corpora of this type
when the count_scope is global, there is another parameter:
- termset_scope: {"local" or "global"}
- local <=> output list of terms limited to the current corpus
(SELECT ngram_id FROM nodes_ngrams WHERE node_id IN <docs>)
- global <=> output list of terms from all corpora of this type
!!!! (more terms)
- overwrite_id: optional id of a pre-existing TFIDF-XXXX node for this corpus
- overwrite_id: optional id of a pre-existing TFIDF-XXXX node for this corpus
(the Node and its previous NodeNodeNgram rows will be replaced)
(the Node and its previous NodeNodeNgram rows will be replaced)
"""
"""
corpus_docids_subquery
=
(
session
.
query
(
Node
.
id
)
.
filter
(
Node
.
parent_id
==
corpus
.
id
)
.
filter
(
Node
.
typename
==
"DOCUMENT"
)
.
subquery
()
)
# local <=> within this corpus
# local <=> within this corpus
if
scope
==
"local"
:
if
count_
scope
==
"local"
:
# All docs of this corpus
# All docs of this corpus
docids_subquery
=
(
session
count_scope_subquery
=
corpus_docids_subquery
.
query
(
Node
.
id
)
.
filter
(
Node
.
parent_id
==
corpus
.
id
)
termset_scope_subquery
=
(
session
.
filter
(
Node
.
typename
==
"DOCUMENT"
)
.
query
(
NodeNgram
.
ngram_id
)
.
subquery
()
.
filter
(
NodeNgram
.
node_id
.
in_
(
corpus_docids_subquery
))
)
.
subquery
()
)
# global <=> within all corpora of this source
# global <=> within all corpora of this source
elif
scope
==
"global"
:
elif
count_
scope
==
"global"
:
this_source_type
=
corpus
.
resources
()[
0
][
'type'
]
this_source_type
=
corpus
.
resources
()[
0
][
'type'
]
# all corpora with the same source type
# all corpora with the same source type
# (we need raw SQL query for postgres JSON operators) (TODO test speed)
# (we need raw SQL query for postgres JSON operators) (TODO test speed)
same_source_corpora_query
=
(
session
same_source_corpora_query
=
(
session
.
query
(
Node
.
id
)
.
query
(
Node
.
id
)
.
from_statement
(
text
(
.
from_statement
(
text
(
"""
"""
SELECT id FROM nodes
SELECT id FROM nodes
WHERE hyperdata->'resources' @> '[{
\"
type
\"
\
:
%
s}]'
WHERE hyperdata->'resources' @> '[{
\"
type
\"
\
:
%
s}]'
"""
%
this_source_type
"""
%
this_source_type
))
))
)
)
# All docs **in all corpora of the same source**
# All docs **in all corpora of the same source**
docids_subquery
=
(
session
ressource_docids_subquery
=
(
session
.
query
(
Node
.
id
)
.
query
(
Node
.
id
)
.
filter
(
Node
.
parent_id
.
in_
(
same_source_corpora_query
))
.
filter
(
Node
.
parent_id
.
in_
(
same_source_corpora_query
))
.
filter
(
Node
.
typename
==
"DOCUMENT"
)
.
filter
(
Node
.
typename
==
"DOCUMENT"
)
.
subquery
()
)
count_scope_subquery
=
ressource_docids_subquery
if
termset_scope
==
"global"
:
termset_scope_subquery
=
(
session
.
query
(
NodeNgram
.
ngram_id
)
.
filter
(
NodeNgram
.
node_id
.
in_
(
ressource_docids_subquery
))
.
subquery
()
)
else
:
termset_scope_subquery
=
(
session
.
query
(
NodeNgram
.
ngram_id
)
.
filter
(
NodeNgram
.
node_id
.
in_
(
corpus_docids_subquery
))
.
subquery
()
.
subquery
()
)
)
# N
# N
total_docs
=
session
.
query
(
docids_subquery
)
.
count
()
total_docs
=
session
.
query
(
ressource_
docids_subquery
)
.
count
()
#
or perhaps at least
do the occurrences right now at the same time
#
nb: possible to
do the occurrences right now at the same time
tf_nd
=
(
session
tf_nd
=
(
session
.
query
(
.
query
(
NodeNgram
.
ngram_id
,
NodeNgram
.
ngram_id
,
func
.
sum
(
NodeNgram
.
weight
),
# tf: same as occnode
func
.
sum
(
NodeNgram
.
weight
),
# tf: same as occnode
func
.
count
(
NodeNgram
.
node_id
)
# nd: n docs with term
func
.
count
(
NodeNgram
.
node_id
)
# nd: n docs with term
)
)
.
filter
(
NodeNgram
.
node_id
.
in_
(
docids_subquery
))
.
filter
(
NodeNgram
.
node_id
.
in_
(
count_scope_subquery
))
.
filter
(
NodeNgram
.
ngram_id
.
in_
(
termset_scope_subquery
))
.
group_by
(
NodeNgram
.
ngram_id
)
.
group_by
(
NodeNgram
.
ngram_id
)
.
all
()
.
all
()
)
)
...
@@ -162,10 +200,10 @@ def compute_cumulated_tfidf(corpus, scope="local", overwrite_id=None):
...
@@ -162,10 +200,10 @@ def compute_cumulated_tfidf(corpus, scope="local", overwrite_id=None):
else
:
else
:
# create the new TFIDF-XXXX node
# create the new TFIDF-XXXX node
tfidf_nd
=
corpus
.
add_child
()
tfidf_nd
=
corpus
.
add_child
()
if
scope
==
"local"
:
# TODO discuss use and find new typename
if
count_
scope
==
"local"
:
# TODO discuss use and find new typename
tfidf_nd
.
typename
=
"TFIDF-CORPUS"
tfidf_nd
.
typename
=
"TFIDF-CORPUS"
tfidf_nd
.
name
=
"tfidf-cumul-corpus (in:
%
s)"
%
corpus
.
id
tfidf_nd
.
name
=
"tfidf-cumul-corpus (in:
%
s)"
%
corpus
.
id
elif
scope
==
"global"
:
elif
count_
scope
==
"global"
:
tfidf_nd
.
typename
=
"TFIDF-GLOBAL"
tfidf_nd
.
typename
=
"TFIDF-GLOBAL"
tfidf_nd
.
name
=
"tfidf-cumul-global (in type:
%
s)"
%
this_source_type
tfidf_nd
.
name
=
"tfidf-cumul-global (in type:
%
s)"
%
this_source_type
session
.
add
(
tfidf_nd
)
session
.
add
(
tfidf_nd
)
...
...
static/lib/gargantext/NGrams_dyna_chart_and_table.js
View file @
87f75264
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment