humanities / gargantext, commit 6c438c85
Authored May 21, 2016 by delanoe

    Merge branch 'romain-refactoring' into unstable

Parents: f236759c, e52afd97
Showing 7 changed files with 363 additions and 198 deletions (+363, -198)
Changed files:

    gargantext/constants.py                               +6    -4
    gargantext/util/lists.py                              +11   -7
    gargantext/util/toolchain/__init__.py                 +17   -13
    gargantext/util/toolchain/list_main.py                +3    -0
    gargantext/util/toolchain/metric_tfidf.py             +142  -52
    gargantext/util/toolchain/ngram_coocs.py              +161  -97
    static/lib/gargantext/NGrams_dyna_chart_and_table.js  +23   -25
gargantext/constants.py

@@ -8,18 +8,19 @@ import re
 LISTTYPES = {
     'DOCUMENT'     : WeightedList,
-    'GROUPLIST'    : Translations,
+    'GROUPLIST'    : Translations,     # todo remove "LIST" from name
     'STOPLIST'     : UnweightedList,
     'MAINLIST'     : UnweightedList,
     'MAPLIST'      : UnweightedList,
     'SPECIFICITY'  : WeightedList,
-    'OCCURRENCES'  : WeightedContextIndex,
+    'OCCURRENCES'  : WeightedIndex,    # todo replace by WeightedList
     'COOCCURRENCES': WeightedMatrix,
-    'TFIDF-CORPUS' : WeightedContextIndex,
-    'TFIDF-GLOBAL' : WeightedContextIndex,
+    'TFIDF-CORPUS' : WeightedIndex,    # todo split -> WeightedList for ti_rank and WeightedIndex for tfidf
+    'TFIDF-GLOBAL' : WeightedIndex,    # todo split -> WeightedList for ti_rank and WeightedIndex for tfidf
 }

 NODETYPES = [
     None,   # TODO separate id not array index, read by models.node
     # documents hierarchy
     'USER',                # 1
...
@@ -40,6 +41,7 @@ NODETYPES = [
     'TFIDF-GLOBAL',        # 14
     # docs subset
     'FAVORITES',           # 15
+    # TODO add ti RANK
 ]

 INDEXED_HYPERDATA = {
...
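Reading note: LISTTYPES is a typename-to-container dispatch table, so code elsewhere can pick the right list class from a node's typename. A minimal sketch of that dispatch with hypothetical stand-in classes (not the real gargantext.util.lists implementations):

    # hypothetical stand-ins, just to show the dispatch pattern
    class UnweightedList(set): pass
    class WeightedList(dict): pass
    class WeightedIndex(dict): pass

    LISTTYPES = {
        'STOPLIST'    : UnweightedList,
        'SPECIFICITY' : WeightedList,
        'OCCURRENCES' : WeightedIndex,
    }

    def load_list(typename, rows):
        # instantiate the container registered for this node typename
        return LISTTYPES[typename](rows)

    print(type(load_list('STOPLIST', [1, 2, 3])).__name__)    # UnweightedList
    print(load_list('OCCURRENCES', {7: 2.0, 8: 5.0}))         # {7: 2.0, 8: 5.0}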
gargantext/util/lists.py

@@ -2,7 +2,7 @@
 """
-__all__ = ['Translations', 'WeightedMatrix', 'UnweightedList', 'WeightedList', 'WeightedContextIndex']
+__all__ = ['Translations', 'WeightedMatrix', 'UnweightedList', 'WeightedList', 'WeightedIndex']

 from gargantext.util.db import session, bulk_insert
...
@@ -165,15 +165,18 @@ class Translations(_BaseClass):
         )

-class WeightedContextIndex(_BaseClass):
+class WeightedIndex(_BaseClass):
     """
     associated model : NodeNodeNgram
     associated columns : node1_id | node2_id | ngram_id | score (float)
                          ^^^^ reserved for this object's id

-    Tensor representing a contextual index or registry
-    (matrix of weighted ngrams *per* doc *per* context)
+    Matrix representing a weighted word index across docs or small context nodes
+    (matrix of weighted ngrams *per* doc)

-    Example : tfidf by corpus
+    Example : tfidf within a corpus
     """
     def __init__(self, source=None):
         self.items = defaultdict(float)
...
@@ -182,7 +185,7 @@ class WeightedContextIndex(_BaseClass):

-
+# ?TODO rename WeightedWordmatrix
 class WeightedMatrix(_BaseClass):
     def __init__(self, source=None):
...
@@ -294,7 +297,7 @@ class WeightedMatrix(_BaseClass):
             result.items[key1, key2] = value / sqrt(other.items[key1] * other.items[key2])
         return result

-
+# ?TODO rename Wordlist
 class UnweightedList(_BaseClass):
     def __init__(self, source=None):
...
@@ -399,6 +402,7 @@ class UnweightedList(_BaseClass):
         )

+# ?TODO rename WeightedWordlist
 class WeightedList(_BaseClass):
     def __init__(self, source=None):
...
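Reviewer note: the renamed class keeps the NodeNodeNgram row shape its docstring describes. A minimal pure-Python sketch of that shape (a stand-in, not the project's actual class; the rows() helper is hypothetical):

    from collections import defaultdict

    class WeightedIndexSketch:
        """Rows map to NodeNodeNgram columns: node1_id | node2_id | ngram_id | score.

        node1_id is reserved for this object's own node id, so in memory only
        (node2_id, i.e. doc, ngram_id) -> float weight is needed.
        """
        def __init__(self, source=None):
            self.items = defaultdict(float)   # (doc_node_id, ngram_id) -> score
            if source is not None:
                for doc_node_id, ngram_id, score in source:
                    self.items[doc_node_id, ngram_id] = score

        def rows(self, own_node_id):
            # the 4-tuples a bulk_insert into NodeNodeNgram would receive
            return ((own_node_id, doc, ng, w) for (doc, ng), w in self.items.items())

    idx = WeightedIndexSketch([(101, 7, 0.5), (102, 7, 1.25)])
    print(list(idx.rows(own_node_id=999)))
    # [(999, 101, 7, 0.5), (999, 102, 7, 1.25)]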
gargantext/util/toolchain/__init__.py

@@ -111,20 +111,16 @@ def parse_extract_indexhyperdata(corpus):
     group_id = compute_groups(corpus, stoplist_id=None)
     print('CORPUS #%d: [%s] new grouplist node #%i' % (corpus.id, t(), group_id))

-    # -> write occurrences to Node and NodeNodeNgram  # possible: factorize with tfidf
-    occ_id = compute_occs(corpus)
-    print('CORPUS #%d: [%s] new occs node #%i' % (corpus.id, t(), occ_id))
-
-    # ------------
-    # -> write local tfidf similarities to Node and NodeNodeNgram
-    ltfidf_id = compute_tfidf_local(corpus)
-    print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))
+    # -> write occurrences to Node and NodeNodeNgram  # (todo: NodeNgram)
+    occ_id = compute_occs(corpus, groupings_id=group_id)
+    print('CORPUS #%d: [%s] new occs node #%i' % (corpus.id, t(), occ_id))

-    # -> write global and cumulated ti_ranking (tfidf ranking vector) to Node and NodeNodeNgram
-    tirank_id = compute_ti_ranking(corpus, count_scope="global", termset_scope="local")
-    print('CORPUS #%d: [%s] new tfidf ranking node #%i' % (corpus.id, t(), tirank_id))
+    # -> write cumulated ti_ranking (tfidf ranking vector) to Node and NodeNodeNgram (todo: NodeNgram)
+    tirank_id = compute_ti_ranking(corpus, groupings_id=group_id, count_scope="global")
+    print('CORPUS #%d: [%s] new ti ranking node #%i' % (corpus.id, t(), tirank_id))

     # -> mainlist: filter + write (to Node and NodeNgram)
     mainlist_id = do_mainlist(corpus,
...
@@ -132,13 +128,21 @@ def parse_extract_indexhyperdata(corpus):
                               stoplist_id=stop_id)
     print('CORPUS #%d: [%s] new mainlist node #%i' % (corpus.id, t(), mainlist_id))

+    # -> write local tfidf similarities to Node and NodeNodeNgram
+    # TODO only on mainlist
+    ltfidf_id = compute_tfidf_local(corpus)
+    print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))
+    # => used for doc <=> ngram association

     # ------------
     # -> cooccurrences: compute + write (=> Node and NodeNodeNgram)
-    cooc_id = compute_coocs(corpus, mainlist_id=mainlist_id)
+    cooc_id = compute_coocs(corpus, mainlist_id=mainlist_id, groupings_id=group_id)
     print('CORPUS #%d: [%s] new coocs node #%i' % (corpus.id, t(), cooc_id))

     # -> specificity: compute + write (=> NodeNodeNgram)
-    spec_id = compute_specificity(corpus, cooc_id=cooc_id)
+    spec_id = compute_specificity(corpus, cooc_id=cooc_id
+                                  # ,groupings_id = group_id
+                                  )
     print('CORPUS #%d: [%s] new specificity node #%i' % (corpus.id, t(), spec_id))

     # ?? maplist: compute + write (to Node and NodeNgram)
...
gargantext/util/toolchain/list_main.py

@@ -65,6 +65,9 @@ def do_mainlist(corpus,
     ordered_filtered_tfidf = (session
         .query(NodeNodeNgram.ngram_id)
        .filter(NodeNodeNgram.node1_id == ranking_scores_id)
+       # NOT IN but speed theoretically ok here
+       # see http://sqlperformance.com/2012/12/t-sql-queries/left-anti-semi-join
+       # but http://stackoverflow.com/questions/2246772/whats-the-difference-between-not-exists-vs-not-in-vs-left-join-where-is-null/2246793#2246793
        .filter(~NodeNodeNgram.ngram_id.in_(stopterms_subquery))
        .order_by(desc(NodeNodeNgram.score))
     )
...
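The three added comment lines flag a classic anti-join choice. Both NOT IN and LEFT JOIN ... IS NULL keep the ranked ngrams absent from the stop terms; a toy sqlite3 comparison of the two forms (invented tables and values, not gargantext's schema):

    import sqlite3

    db = sqlite3.connect(":memory:")
    db.executescript("""
        CREATE TABLE ranked (ngram_id INT, score REAL);
        CREATE TABLE stop   (ngram_id INT);
        INSERT INTO ranked VALUES (1, 9.0), (2, 5.0), (3, 1.0);
        INSERT INTO stop   VALUES (2);
    """)

    not_in = db.execute(
        "SELECT ngram_id FROM ranked"
        " WHERE ngram_id NOT IN (SELECT ngram_id FROM stop)"
        " ORDER BY score DESC").fetchall()

    anti_join = db.execute(
        "SELECT r.ngram_id FROM ranked r"
        " LEFT JOIN stop s ON s.ngram_id = r.ngram_id"
        " WHERE s.ngram_id IS NULL"
        " ORDER BY r.score DESC").fetchall()

    assert not_in == anti_join == [(1,), (3,)]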
gargantext/util/toolchain/metric_tfidf.py

@@ -9,13 +9,15 @@ FIXME: "having the same source" means we need to select inside hyperdata
 """
 from gargantext.models        import Node, NodeNgram, NodeNodeNgram, NodeNgramNgram
 from gargantext.util.db_cache import cache
 from gargantext.util.db       import session, bulk_insert, aliased, \
                                      func # = sqlalchemy.func like sum() or count()
+from sqlalchemy.sql.expression import case  # for choice if ngram has mainform or not
+from sqlalchemy               import distinct  # for list of unique ngram_ids within a corpus
 from math                     import log
+from re                       import match
 # £TODO
-# from gargantext.util.lists import WeightedContextIndex
+# from gargantext.util.lists import WeightedIndex

 def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
...
@@ -32,7 +34,7 @@ def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
     Parameters:
        - overwrite_id: optional id of a pre-existing OCCURRENCES node for this corpus
                        (the Node and its previous NodeNodeNgram rows will be replaced)
-       - groupings_id: optional id of a GROUPLIST node for this corpus
+       - groupings_id: optional id of a GROUPLIST node for these ngrams
                        IF absent the occurrences are the sums for each ngram
                        IF present they're the sums for each ngram's mainform
     """
...
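The IF absent / IF present contrast in this docstring is the core of the commit: with a GROUPLIST, each subform's occurrences are credited to its mainform. A pure-Python sketch of that semantics with invented ids (the real work happens in SQL, as the hunks below show):

    from collections import defaultdict

    # toy index rows: (doc_node_id, ngram_id, weight)
    index = [(1, 7, 2.0), (1, 11, 1.0), (2, 11, 3.0), (2, 12, 1.0)]
    # toy GROUPLIST: subform -> mainform (here 11 and 12 group under 7)
    syno = {11: 7, 12: 7}

    def occs(index, syno=None):
        totals = defaultdict(float)
        for doc_id, ngram_id, weight in index:
            counted = syno.get(ngram_id, ngram_id) if syno else ngram_id
            totals[counted] += weight
        return dict(totals)

    print(occs(index))         # groupings_id absent:  {7: 2.0, 11: 4.0, 12: 1.0}
    print(occs(index, syno))   # groupings_id present: {7: 7.0}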
@@ -115,7 +117,8 @@ def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
     if overwrite_id:
         # overwrite pre-existing id
         the_id = overwrite_id
+        # occnode = cache.Node[overwrite_id]
         session.query(NodeNodeNgram).filter(NodeNodeNgram.node1_id == the_id).delete()
         session.commit()
     else:
         # create the new OCCURRENCES node
         occnode = corpus.add_child(
...
@@ -126,8 +129,8 @@ def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
         session.commit()
         the_id = occnode.id

-    # reflect that in NodeNodeNgrams (could be NodeNgram but harmony with tfidf)
-    # £TODO replace bulk_insert by something like WeightedContextMatrix.save()
+    # £TODO make it NodeNgram instead NodeNodeNgram ! and rebase :/
+    #       (idem ti_ranking)
     bulk_insert(
         NodeNodeNgram,
         ('node1_id' , 'node2_id', 'ngram_id', 'score'),
...
@@ -137,14 +140,26 @@ def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
     return the_id

-def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overwrite_id=None):
+def compute_ti_ranking(corpus, groupings_id=None,
+                       count_scope="local", termset_scope="local",
+                       overwrite_id=None):
     """
     # TODO check if cumulated tfs correspond to app's use cases and intention

-    Calculates tfidf ranking (cumulated tfidf for each ngram) within given scope
+    Calculates tfidf ranking within given scope
+                   ----------
+                       |
+             via weighting of
+             cumulated tfidf  ---------  Sum{j}(tf_ij) * ln(N / |{docs d_j : ng_i in d_j}|)
+                                              per ngram ng_i
+                                          (or per mainform ng_i' if groups)
+                                              across some docs d_j

     Parameters:
-      - the corpus itself
+      - the corpus itself (or corpus_id)
+      - groupings_id: optional id of a GROUPLIST node for these ngrams
+                      IF absent the ti weights are the sums for each ngram
+                      IF present they're the sums for each ngram's mainform
       - count_scope: {"local" or "global"}
          - local  <=> frequencies counted in the current corpus
          - global <=> frequencies counted in all corpora of this type
...
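A worked instance of this ranking formula, using toy (counted_ngform, tf, nd) rows like those the main query below returns (numbers invented):

    from math import log

    total_docs = 100                        # N
    log_tot_docs = log(total_docs)

    # rows as (counted_ngform, tf, nd): summed term freq and n docs with term
    tf_nd = [(7, 40.0, 10), (8, 40.0, 80), (9, 3.0, 1)]

    tfidfsum = {}
    for ngram_i, tf_i, nd_i in tf_nd:
        # tf_i * log(N / nd_i), written with a precomputed log as in the code
        tfidfsum[ngram_i] = tf_i * (log_tot_docs - log(nd_i))

    for ng, score in sorted(tfidfsum.items(), key=lambda kv: -kv[1]):
        print(ng, round(score, 2))
    # ngram 7 outranks ngram 8 despite equal tf: it is concentrated in fewer docs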
@@ -153,43 +168,94 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
       - termset_scope: {"local" or "global"}
          - local  <=> output list of terms limited to the current corpus
                       (SELECT DISTINCT ngram_id FROM nodes_ngrams WHERE node_id IN <docs>)
-         - global <=> output list of terms from all corpora of this type
+         - global <=> output list of terms found in global doc scope
                       !!!! (many more terms)
-      - overwrite_id: optional id of a pre-existing TFIDF-XXXX node for this corpus
+      - overwrite_id: optional id of a pre-existing XXXX node for this corpus
                       (the Node and its previous NodeNodeNgram rows will be replaced)
     """
-    # validate string params
-    if count_scope not in ["local","global"]:
-        raise ValueError("compute_ti_ranking: count_scope param allowed values: 'local', 'global'")
-    if termset_scope not in ["local","global"]:
-        raise ValueError("compute_ti_ranking: termset_scope param allowed values: 'local', 'global'")
-    if count_scope == "local" and termset_scope == "global":
-        raise ValueError("compute_ti_ranking: the termset_scope param can be 'global' iff count_scope param is 'global' too.")
+    # get corpus
+    if type(corpus) == int:
+        corpus_id = corpus
+        corpus = cache.Node[corpus_id]
+    elif type(corpus) == str and match(r'\d+$', corpus):
+        corpus_id = int(corpus)
+        corpus = cache.Node[corpus_id]
+    else:
+        # assuming Node class
+        corpus_id = corpus.id
+
+    # prepare sqla mainform vs ngram selector
+    ngform_i = None
+    if not groupings_id:
+        ngform_i = NodeNgram.ngram_id
+    else:
+        # prepare translations
+        syno = (session.query(NodeNgramNgram.ngram1_id,
+                              NodeNgramNgram.ngram2_id)
+                 .filter(NodeNgramNgram.node_id == groupings_id)
+                 .subquery()
+               )
+        # cf. detailed comment in compute_occs() + todo factorize
+        ngform_i = case([
+                         (syno.c.ngram1_id != None, syno.c.ngram1_id),
+                         (syno.c.ngram1_id == None, NodeNgram.ngram_id)
+                         #  condition                value
+                        ])

     # MAIN QUERY SKELETON
     tf_nd_query = (session
-                    .query(NodeNgram.ngram_id,
+                    .query(
+                        # NodeNgram.ngram_id
+                        # or similar if grouping ngrams under their mainform:
+                        ngform_i.label("counted_ngform"),

                         # the tfidf elements
                         # ------------------
                         func.sum(NodeNgram.weight),    # tf: same as occurrences
                                                        # -----------------------
                         func.count(NodeNgram.node_id)  # nd: n docs with term
                                                        # --------------------
                        )
-                    .group_by(NodeNgram.ngram_id)
+                    .group_by("counted_ngform")

-                   # count_scope to specify in which doc nodes to count
-                   # -----------
+                   # optional *count_scope*: if we'll restrict the doc nodes
+                   # -------------
                    # .join(countdocs_subquery,
                    #       countdocs_subquery.c.id == NodeNgram.node_id)

-                   # termset_scope: if we'll restrict the ngrams
+                   # optional *termset_scope*: if we'll restrict the ngrams
                    # -------------
                    # .join(termset_subquery,
                    #       termset_subquery.c.uniq_ngid == NodeNgram.ngram_id)
+
+                   # optional translations to bring the subform's replacement
+                   # ------------
+                   # .outerjoin(syno,
+                   #            syno.c.ngram2_id == NodeNgram.ngram_id)
                   )

+    # validate string params
+    if count_scope not in ["local","global"]:
+        raise ValueError("compute_ti_ranking: count_scope param allowed values: 'local', 'global'")
+    if termset_scope not in ["local","global"]:
+        raise ValueError("compute_ti_ranking: termset_scope param allowed values: 'local', 'global'")
+    if count_scope == "local" and termset_scope == "global":
+        raise ValueError("compute_ti_ranking: the termset_scope param can be 'global' iff count_scope param is 'global' too.")
+
+    # TUNING THE QUERY
+    if groupings_id:
+        tf_nd_query = tf_nd_query.outerjoin(syno,
+                                            syno.c.ngram2_id == NodeNgram.ngram_id)

     # local <=> within this corpus
     if count_scope == "local":
...
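The case([...]) construct above is a mainform fallback over the outer-joined synonym table: when no group matches, syno.c.ngram1_id comes back NULL and the row's own ngram_id is used, i.e. syno.get(ng, ng) in dict terms. A toy emulation of the resulting (counted_ngform, tf, nd) aggregation (invented rows, not the ORM query):

    from collections import defaultdict

    # toy NodeNgram rows: (node_id, i.e. doc, ngram_id, weight)
    rows = [(1, 7, 2.0), (1, 11, 1.0), (2, 11, 3.0), (3, 8, 1.0)]
    syno = {11: 7}                          # ngram2_id (subform) -> ngram1_id (mainform)

    tf = defaultdict(float)                 # func.sum(NodeNgram.weight)
    nd = defaultdict(int)                   # func.count(NodeNgram.node_id)
    for doc, ng, w in rows:
        counted_ngform = syno.get(ng, ng)   # the case(...) fallback
        tf[counted_ngform] += w
        nd[counted_ngform] += 1             # counts index rows, not distinct docs

    print(dict(tf))   # {7: 6.0, 8: 1.0}
    print(dict(nd))   # {7: 3, 8: 1}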
@@ -197,14 +263,14 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
         countdocs_subquery = (session
                     .query(Node.id)
                     .filter(Node.typename == "DOCUMENT")
-                    .filter(Node.parent_id == corpus.id)
+                    .filter(Node.parent_id == corpus_id)
                     .subquery()
                    )

-        # both scopes are the same: no need to independently restrict the ngrams
+        # no need to independently restrict the ngrams
         tf_nd_query = tf_nd_query.join(countdocs_subquery,
                                        countdocs_subquery.c.id == NodeNgram.node_id)
         # ---

     # global <=> within all corpora of this source
     elif count_scope == "global":
...
@@ -220,6 +286,7 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
                     # join on parent_id with selected corpora nodes
                     .join(CorpusNode, CorpusNode.id == Node.parent_id)
                     .filter(CorpusNode.typename == "CORPUS")
+                    # TODO index corpus_sourcetype in DB
                     .filter(CorpusNode.hyperdata['resources'][0]['type'].astext == str(this_source_type))
                     .subquery()
                    )
...
@@ -228,15 +295,19 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
         # both scopes are the same: no need to independently restrict the ngrams
         tf_nd_query = tf_nd_query.join(countdocs_subquery,
                                        countdocs_subquery.c.id == NodeNgram.node_id)
         # ---

     elif termset_scope == "local":
-        # All unique terms in the original corpus
+        # All unique terms...
         termset_subquery = (session
-                    .query(distinct(NodeNgram.ngram_id).label("uniq_ngid"))
+                    .query(
+                        distinct(NodeNgram.ngram_id).label("uniq_ngid")
+                     )
+                    # ... in the original corpus
                     .join(Node)
                     .filter(Node.typename == "DOCUMENT")
-                    .filter(Node.parent_id == corpus.id)
+                    .filter(Node.parent_id == corpus_id)
                     .subquery()
                    )
...
@@ -247,42 +318,59 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
                     .join(termset_subquery,
                           termset_subquery.c.uniq_ngid == NodeNgram.ngram_id)
                    )
         # ---

-    # N
+    # M
     total_docs = session.query(countdocs_subquery).count()
     log_tot_docs = log(total_docs)

     # result
     tf_nd = tf_nd_query.all()

-    # -------------------------------------------------
-    tfidfs = {}
-    for (ngram_id, tf, nd) in tf_nd:
-        # tfidfs[ngram_id] = tf * log(total_docs/nd)
-        tfidfs[ngram_id] = tf * (log_tot_docs - log(nd))
-    # -------------------------------------------------
+    # -------------- summation over each word i ------------
+    tfidfsum = {}
+    for (ngram_i, tf_i, nd_i) in tf_nd:
+        # tfidfsum[ngram_i] = tf_i * log(total_docs/nd_i)
+        tfidfsum[ngram_i] = tf_i * (log_tot_docs - log(nd_i))
+    # ------------------------------------------------------
+
+    # N for info
+    total_ngramforms = len(tfidfsum)

     if overwrite_id:
         the_id = overwrite_id
         session.query(NodeNodeNgram).filter(NodeNodeNgram.node1_id == the_id).delete()
         session.commit()
     else:
-        # create the new TFIDF-XXXX node
-        tfidf_nd = corpus.add_child()
-        if count_scope == "local":
-            # TODO discuss use and find new typename
-            tfidf_nd.typename = "TFIDF-CORPUS"
-            tfidf_nd.name = "tfidf-cumul-corpus (in:%s)" % corpus.id
-        elif count_scope == "global":
-            tfidf_nd.typename = "TFIDF-GLOBAL"
-            tfidf_nd.name = "tfidf-cumul-global (in type:%s)" % this_source_type
-        session.add(tfidf_nd)
+        # create the new TFIDF-XXXX node to get an id
+        tir_nd = corpus.add_child()
+        if count_scope == "local":
+            tir_nd.typename = "TFIDF-CORPUS"
+            tir_nd.name = "ti rank (%i ngforms in corpus:%s)" % (
+                           total_ngramforms, corpus_id)
+        elif count_scope == "global":
+            tir_nd.typename = "TFIDF-GLOBAL"
+            tir_nd.name = "ti rank (%i ngforms %s in corpora of sourcetype:%s)" % (
+                           total_ngramforms,
+                           ("from corpus %i" % corpus_id) if (termset_scope == "local") else "",
+                           this_source_type)
+        session.add(tir_nd)
         session.commit()
-        the_id = tfidf_nd.id
+        the_id = tir_nd.id
+
+    # TODO 1 discuss use and find new typename
+    # TODO 2 release these 2 typenames TFIDF-CORPUS and TFIDF-GLOBAL
+    # TODO 3 recreate them elsewhere in their sims (WeightedIndex) version
+    # TODO 4 requalify this here as a NodeNgram
+    # then TODO 5 use WeightedList.save() !

     # reflect that in NodeNodeNgrams
     # £TODO replace bulk_insert by something like WeightedContextMatrix.save()
     bulk_insert(
         NodeNodeNgram,
         ('node1_id' , 'node2_id', 'ngram_id', 'score'),
-        ((the_id, corpus.id, ng, tfidfs[ng]) for ng in tfidfs)
+        ((the_id, corpus_id, ng, tfidfsum[ng]) for ng in tfidfsum)
     )

     return the_id
...
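The bulk_insert at the end consumes a generator of tuples matching the listed columns, essentially a DB-API executemany. A self-contained sqlite3 sketch of the pattern (toy table name and values; bulk_insert itself is gargantext's helper and is not reproduced here):

    import sqlite3

    tfidfsum = {7: 92.1, 8: 8.9}          # ngram_id -> cumulated tfidf (toy values)
    the_id, corpus_id = 999, 42

    db = sqlite3.connect(":memory:")
    db.execute("CREATE TABLE nodes_nodes_ngrams"
               " (node1_id INT, node2_id INT, ngram_id INT, score REAL)")
    # same row shape as the bulk_insert call above: one row per scored ngram
    db.executemany("INSERT INTO nodes_nodes_ngrams VALUES (?, ?, ?, ?)",
                   ((the_id, corpus_id, ng, tfidfsum[ng]) for ng in tfidfsum))
    print(db.execute("SELECT * FROM nodes_nodes_ngrams").fetchall())
    # [(999, 42, 7, 92.1), (999, 42, 8, 8.9)]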
@@ -347,6 +435,8 @@ def compute_tfidf_local(corpus, overwrite_id=None):
     if overwrite_id:
         the_id = overwrite_id
+        session.query(NodeNodeNgram).filter(NodeNodeNgram.node1_id == the_id).delete()
+        session.commit()
     else:
         # create the new TFIDF-CORPUS node
         tfidf_node = corpus.add_child()
...
@@ -357,7 +447,7 @@ def compute_tfidf_local(corpus, overwrite_id=None):
         the_id = tfidf_node.id

     # reflect that in NodeNodeNgrams
-    # £TODO replace bulk_insert by something like WeightedContextMatrix.save()
+    # £TODO replace bulk_insert by something like WeightedIndex.save()
     bulk_insert(
         NodeNodeNgram,
         ('node1_id' , 'node2_id', 'ngram_id', 'score'),
...
gargantext/util/toolchain/ngram_coocs.py

 from gargantext.models         import Node, NodeNgram, NodeNgramNgram, \
-                                      NodeHyperdata
+                                      NodeHyperdata, Ngram
 from gargantext.util.lists     import WeightedMatrix
 from gargantext.util.db        import session, aliased, func
 from gargantext.util.db_cache  import cache
 from gargantext.constants      import DEFAULT_COOC_THRESHOLD
 from datetime                  import datetime
+from sqlalchemy.sql.expression import case  # for choice if ngram has mainform or not

 def compute_coocs( corpus,
                    overwrite_id    = None,
                    threshold       = DEFAULT_COOC_THRESHOLD,
+                   groupings_id    = None,
                    mainlist_id     = None,
                    stoplist_id     = None,
                    start           = None,
...
@@ -41,9 +44,11 @@ def compute_coocs( corpus,
      - overwrite_id: id of a pre-existing COOCCURRENCES node for this corpus
                      (all hyperdata and previous NodeNgramNgram rows will be replaced)
      - threshold: on output cooc count (previously called hapax)
+     - groupings_id: optional synonym relations to add all subform counts
+                     with their mainform's counts
      - mainlist_id: mainlist to constrain the input ngrams
      - stoplist_id: stoplist for filtering input ngrams
-                    (normally unnecessary if a mainlist is provided)
+                    (normally unnecessary if a mainlist is already provided)
      - start, end: provide one or both temporal limits to filter on doc date
                    NB the expected type of parameter value is datetime.datetime
                       (string is also possible but format must follow
...
@@ -56,25 +61,24 @@ def compute_coocs( corpus,
     basic idea for one doc
     ======================
     each pair of ngrams sharing same doc (node_id)
-        SELECT idx1.ngram_id, idx2.ngram_id
-        FROM nodes_ngrams AS idx1, nodes_ngrams AS idx2
+        SELECT idxa.ngram_id, idxb.ngram_id
+        FROM nodes_ngrams AS idxa, nodes_ngrams AS idxb
         ---------------------------------
-        WHERE idx1.node_id = idx2.node_id      <== that's cooc
+        WHERE idxa.node_id = idxb.node_id      <== that's cooc
         ---------------------------------
-        AND idx1.ngram_id <> idx2.ngram_id
-        AND idx1.node_id = MY_DOC ;
+        AND idxa.ngram_id <> idxb.ngram_id
+        AND idxa.node_id = MY_DOC ;

     on entire corpus
     =================
     coocs for each doc :
       - each given pair like (termA, termB) will likely appear several times
-        => we do GROUP BY (x1.ngram_id, x2.ngram_id)
+        => we do GROUP BY (Xindex.ngram_id, Yindex.ngram_id)
       - we count unique appearances of the pair (cooc)
     """
+    # - TODO add grouped element's values in grouping 'chief ngram'
     # - TODO cvalue_id: allow a metric as additional input filter
     # - TODO n_min, n_max : filter on Ngram.n (aka length of ngram)
     # - TODO weighted: if False normal cooc to be saved as result
...
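The docstring's SQL self-join has a direct pure-Python analogue: for each doc, emit every ordered pair of distinct ngrams it contains, then count each pair across docs. A toy sketch (invented doc contents):

    from collections import Counter
    from itertools import permutations

    docs = {                       # doc node_id -> set of ngram_ids
        1: {7, 8, 9},
        2: {7, 8},
        3: {8, 9},
    }

    coocs = Counter()
    for doc_ngrams in docs.values():
        # idxa.ngram_id <> idxb.ngram_id, same node_id: that's cooc
        for a, b in permutations(doc_ngrams, 2):
            coocs[a, b] += 1

    print(coocs[7, 8])   # 2 (docs 1 and 2)
    print(coocs[8, 9])   # 2 (docs 1 and 3)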
@@ -85,130 +89,190 @@ def compute_coocs( corpus,
     # 1,859,408 rows for the simple cooc query
     #    71,134 rows when restricting to ngrams with occ (weight) > 1

-    # docs of our corpus
-    docids_subquery = (session
-                        .query(Node.id)
-                        .filter(Node.parent_id == corpus.id)
-                        .filter(Node.typename == "DOCUMENT")
-                        .subquery()
-                      )
-
     # 2 x the occurrence index table
-    x1 = aliased(NodeNgram)
-    x2 = aliased(NodeNgram)
+    Xindex = aliased(NodeNgram)
+    Yindex = aliased(NodeNgram)
+
+    # for debug (1/4)
+    # Xngram = aliased(Ngram)
+    # Yngram = aliased(Ngram)

-    # cooccurrences columns definition
-    ucooc = func.count(x1.ngram_id).label("ucooc")
-
-    # 1) MAIN DB QUERY
-    coocs_query = (
-        session.query(x1.ngram_id, x2.ngram_id, ucooc)
-        .join(Node, Node.id == x1.node_id)    # <- b/c within corpus
-        .join(x2, x1.node_id == Node.id)      # <- b/c within corpus
-        .filter(x1.node_id == x2.node_id)     # <- by definition of cooc
-        .filter(x1.ngram_id != x2.ngram_id)   # <- b/c not with itself
-        .group_by(x1.ngram_id, x2.ngram_id)
-        .order_by(ucooc)
-    )
+    # 1) prepare definition of counted forms
+    if not groupings_id:
+        # no groupings => the counted forms are the ngrams
+        Xindex_ngform_id = Xindex.ngram_id
+        Yindex_ngform_id = Yindex.ngram_id
+
+    # groupings: cf. detailed comment in compute_occs() + todo factorize
+    else:
+        # prepare translations
+        Xsyno = (session.query(NodeNgramNgram.ngram1_id,
+                               NodeNgramNgram.ngram2_id)
+                  .filter(NodeNgramNgram.node_id == groupings_id)
+                  .subquery()
+                )
+        # further use as anon tables prevents doing Ysyno = Xsyno
+        Ysyno = (session.query(NodeNgramNgram.ngram1_id,
+                               NodeNgramNgram.ngram2_id)
+                  .filter(NodeNgramNgram.node_id == groupings_id)
+                  .subquery()
+                )
+
+        # groupings => define the counted form depending on the existence of a synonym
+        Xindex_ngform_id = case([
+                                 (Xsyno.c.ngram1_id != None, Xsyno.c.ngram1_id),
+                                 (Xsyno.c.ngram1_id == None, Xindex.ngram_id)
+                                 #  condition                 value
+                                ])
+        Yindex_ngform_id = case([
+                                 (Ysyno.c.ngram1_id != None, Ysyno.c.ngram1_id),
+                                 (Ysyno.c.ngram1_id == None, Yindex.ngram_id)
+                                ])
+    # ---
+
+    # 2) BASE DB QUERY
+
+    # cooccurrences columns definition ----------------
+    ucooc = func.count(Xindex_ngform_id).label("ucooc")
+    # NB could be X or Y in this line
+    #    (we're counting grouped rows and just happen to do it on this column)
+
+    base_query = (
+        session.query(
+            Xindex_ngform_id,
+            Yindex_ngform_id,
+            ucooc
+            # for debug (2/4)
+            #, Xngram.terms.label("w_x")
+            #, Yngram.terms.label("w_y")
+        )
+        .join(Yindex, Xindex.node_id == Yindex.node_id)    # <- by definition of cooc
+        .join(Node, Node.id == Xindex.node_id)             # <- b/c within corpus
+        .filter(Node.parent_id == corpus.id)               # <- b/c within corpus
+        .filter(Node.typename == "DOCUMENT")               # <- b/c within corpus
+        .filter(Xindex_ngform_id != Yindex_ngform_id)      # <- b/c not with itself
+    )
+
+    # outerjoin the synonyms if needed
+    if groupings_id:
+        base_query = (base_query
+                       .outerjoin(Xsyno,   # <- synonyms for Xindex.ngrams
+                                  Xsyno.c.ngram2_id == Xindex.ngram_id)
+                       .outerjoin(Ysyno,   # <- synonyms for Yindex.ngrams
+                                  Ysyno.c.ngram2_id == Yindex.ngram_id)
+                     )
+
+    # 3) counting clause in any case
+    coocs_query = (base_query
+                    .group_by(
+                        Xindex_ngform_id, Yindex_ngform_id   # <- what we're counting
+                        # for debug (3/4)
+                        #,"w_x", "w_y"
+                    )
+                    # for debug (4/4)
+                    #.join(Xngram, Xngram.id == Xindex_ngform_id)
+                    #.join(Yngram, Yngram.id == Yindex_ngform_id)
+                  )

-    # 2) INPUT FILTERS (reduce N before O(N²))
+    # 4) INPUT FILTERS (reduce N before O(N²))
     if mainlist_id:
         m1 = aliased(NodeNgram)
         m2 = aliased(NodeNgram)
         coocs_query = (coocs_query
-                        .join(m1, m1.ngram_id == x1.ngram_id)
-                        .join(m2, m2.ngram_id == x2.ngram_id)
+                        .join(m1, m1.ngram_id == Xindex_ngform_id)
+                        .join(m2, m2.ngram_id == Yindex_ngform_id)
                         .filter(m1.node_id == mainlist_id)
                         .filter(m2.node_id == mainlist_id)
                       )

     if stoplist_id:
-        s1 = aliased(NodeNgram)
-        s2 = aliased(NodeNgram)
+        s1 = (session.query(NodeNgram.ngram_id)
+               .filter(NodeNgram.node_id == stoplist_id)
+               .subquery()
+             )
+        # further use as anon tables prevents doing s2 = s1
+        s2 = (session.query(NodeNgram.ngram_id)
+               .filter(NodeNgram.node_id == stoplist_id)
+               .subquery()
+             )
         coocs_query = (coocs_query
-                        .join(s1, s1.ngram_id == x1.ngram_id)
-                        .join(s2, s2.ngram_id == x2.ngram_id)
-                        .filter(s1.node_id == mainlist_id)
-                        .filter(s2.node_id == mainlist_id)
+                        .outerjoin(s1, s1.c.ngram_id == Xindex_ngform_id)
+                        .outerjoin(s2, s2.c.ngram_id == Yindex_ngform_id)
+                        # equivalent to NOT IN stoplist
+                        .filter(s1.c.ngram_id == None)
+                        .filter(s2.c.ngram_id == None)
                       )

-    if start:
-        if isinstance(start, datetime):
-            start_str = start.strftime("%Y-%m-%d %H:%M:%S")
-        else:
-            start_str = str(start)
-        # direct use of str comparison op because there is consistency b/w
-        # sql alpha sort and chrono sort *in this format %Y-%m-%d %H:%M:%S*
-        # doc_ids matching this limit
-        # TODO s/subqueries/inner joins/ && thanks!
-        starttime_subquery = (session
-                               .query(NodeHyperdata.node_id)
-                               .filter(NodeHyperdata.key == "publication_date")
-                               .filter(NodeHyperdata.value_str >= start_str)
-                               .subquery()
-                             )
-        # the filtering by start limit
-        coocs_query = coocs_query.filter(x1.node_id.in_(starttime_subquery))
-
-    if end:
-        if isinstance(end, datetime):
-            end_str = end.strftime("%Y-%m-%d %H:%M:%S")
-        else:
-            end_str = str(end)
-        # TODO s/subqueries/inner joins/ && thanks!
-        endtime_subquery = (session
-                             .query(NodeHyperdata.node_id)
-                             .filter(NodeHyperdata.key == "publication_date")
-                             .filter(NodeHyperdata.value_str <= end_str)
-                             .subquery()
-                           )
-        # the filtering by end limit
-        coocs_query = coocs_query.filter(x1.node_id.in_(endtime_subquery))
+    if start or end:
+        Time = aliased(NodeHyperdata)
+        coocs_query = (coocs_query
+                        .join(Time, Time.node_id == Xindex.node_id)
+                        .filter(Time.key == "publication_date")
+                      )
+
+    if start:
+        if not isinstance(start, datetime):
+            try:
+                start = datetime.strptime(start, '%Y-%m-%d')
+            except:
+                raise TypeError("'start' param expects datetime object or %%Y-%%m-%%d string")
+        # the filtering by start limit
+        coocs_query = coocs_query.filter(Time.value_utc >= start)
+
+    if end:
+        if not isinstance(end, datetime):
+            try:
+                end = datetime.strptime(end, '%Y-%m-%d')
+            except:
+                raise TypeError("'end' param expects datetime object or %%Y-%%m-%%d string")
+        # the filtering by end limit
+        coocs_query = coocs_query.filter(Time.value_utc <= end)

     if symmetry_filter:
         # 1 filter taking the symmetry into account
         # -> halves the work!!
         # -> but will prevent direct access to x2's cooccurrences
         # -> they will be scattered: recorded under the x1's that preceded x2
         # -> but retrieval will be more expensive, via OR queries like:
         #      WHERE ngram1 = my_ngram OR ngram2 = my_ngram
-        coocs_query = coocs_query.filter(x1.ngram_id < x2.ngram_id)
+        coocs_query = coocs_query.filter(Xindex_ngform_id < Yindex_ngform_id)

     # ------------
     # 2 possible upstream filters to reduce the combinatorics
     #  - e.g. 929k rows => 35k rows
     #  - here on weight, but it degrades the results
     #    => conceivable on another metric (cvalue or tfidf?)
     # coocs_query = coocs_query.filter(x1.weight > 1)
     # coocs_query = coocs_query.filter(x2.weight > 1)
     # ------------

-    # 3) OUTPUT FILTERS
+    # 5) OUTPUT FILTERS
     # ------------------
     # threshold
     # £TODO adjust COOC_THRESHOLD a posteriori:
     # ex: sometimes 2 sometimes 4 depending on sparsity
     coocs_query = coocs_query.having(ucooc >= threshold)

-    # 4) EXECUTE QUERY
+    # 6) EXECUTE QUERY
     # ----------------
     #  => storage in our matrix structure
     matrix = WeightedMatrix(coocs_query.all())
     # -------------------

     # fyi
     shape_0 = len({pair[0] for pair in matrix.items})
...
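The symmetry_filter trade-off above (store each unordered pair once) is easy to see in miniature; a sketch reusing the toy cooc counter from the earlier example:

    from collections import Counter
    from itertools import combinations

    docs = {1: {7, 8, 9}, 2: {7, 8}}

    coocs = Counter()
    for doc_ngrams in docs.values():
        # a < b, so (7, 8) is stored but (8, 7) never is: half the rows,
        # but reading ngram 8's coocs now needs both key orders checked
        for a, b in combinations(sorted(doc_ngrams), 2):
            coocs[a, b] += 1

    print(coocs)   # Counter({(7, 8): 2, (7, 9): 1, (8, 9): 1})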
static/lib/gargantext/NGrams_dyna_chart_and_table.js

@@ -2042,6 +2042,29 @@ function AfterAjax(sourceUrl) {
     // console.log(JSON.stringify(NGrams))
     // -------------------------------------------------------------------

+    // ----------------------------------------- MAPLIST
+    // keepstateId = 1
+    keepstateId = System[0]["statesD"]["keep"]
+    if (Object.keys(NGrams["map"]).length > 0) {
+        for (var ngram_id in NGrams["map"]) {
+            myNgramInfo = NGrams["main"].ngrams[ngram_id]
+            // initialize state of maplist items
+            myNgramInfo["state"] = keepstateId ;
+        }
+    }
+
+    // ----------------------------------------- STOPLIST
+    // delstateId = 2
+    delstateId = System[0]["statesD"]["delete"]
+    if (Object.keys(NGrams["stop"]).length > 0) {
+        for (var ngram_id in NGrams["stop"]) {
+            console.log('stopping ' + ngram_id)
+            myNgramInfo = NGrams["main"].ngrams[ngram_id]
+            // initialize state of stoplist items
+            myNgramInfo["state"] = delstateId ;
+        }
+    }
+
     // Deleting subforms from the ngrams-table, clean start baby!
     if (Object.keys(NGrams["group"].links).length > 0) {
...
@@ -2056,10 +2079,6 @@ function AfterAjax(sourceUrl) {
         }
     }

-    // debug:
-    // console.log('~~~~~~~~~~~~~> (sub) _forms')
-    // console.log( _forms )
-
     // ------------------------------------------- MAINLIST
     // ngrams_data_ will update NGrams.main.ngrams (with subforms removed)
     var ngrams_data_ = {}
...
@@ -2093,27 +2112,6 @@ function AfterAjax(sourceUrl) {
     // console.log( NGrams["main"] )

-    // ----------------------------------------- MAPLIST
-    if (Object.keys(NGrams["map"]).length > 0) {
-        for (var ngram_id in NGrams["main"].ngrams) {
-            myNgram = NGrams["main"].ngrams[ngram_id]
-            if (NGrams["map"][ngram_id]) {
-                // keepstateId = 1
-                keepstateId = System[0]["statesD"]["keep"]
-                // initialize state of maplist items
-                myNgram["state"] = keepstateId ;
-            }
-            else if (NGrams["stop"][ngram_id]) {
-                // delstateId = 2
-                delstateId = System[0]["statesD"]["delete"]
-                // initialize state of stoplist items
-                myNgram["state"] = delstateId ;
-            }
-        }
-    }
-
     // Building the Score-Selector //NGrams["scores"]
     var FirstScore = NGrams["main"].scores.initial
     // TODO scores_div
...