Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
1f1e23ce
Commit
1f1e23ce
authored
May 19, 2016
by
Romain Loth
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
groupings in ti_rank
parent
aa848fd9
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
143 additions
and
52 deletions
+143
-52
metric_tfidf.py
gargantext/util/toolchain/metric_tfidf.py
+143
-52
No files found.
gargantext/util/toolchain/metric_tfidf.py
View file @
1f1e23ce
...
@@ -9,13 +9,15 @@ FIXME: "having the same source" means we need to select inside hyperdata
...
@@ -9,13 +9,15 @@ FIXME: "having the same source" means we need to select inside hyperdata
"""
"""
from
gargantext.models
import
Node
,
NodeNgram
,
NodeNodeNgram
,
NodeNgramNgram
from
gargantext.models
import
Node
,
NodeNgram
,
NodeNodeNgram
,
NodeNgramNgram
from
gargantext.util.db_cache
import
cache
from
gargantext.util.db
import
session
,
bulk_insert
,
aliased
,
\
from
gargantext.util.db
import
session
,
bulk_insert
,
aliased
,
\
func
# = sqlalchemy.func like sum() or count()
func
# = sqlalchemy.func like sum() or count()
from
sqlalchemy.sql.expression
import
case
# for choice if ngram has mainform or not
from
sqlalchemy.sql.expression
import
case
# for choice if ngram has mainform or not
from
sqlalchemy
import
distinct
# for list of unique ngram_ids within a corpus
from
sqlalchemy
import
distinct
# for list of unique ngram_ids within a corpus
from
math
import
log
from
math
import
log
from
re
import
match
# £TODO
# £TODO
# from gargantext.util.lists import Weighted
Context
Index
# from gargantext.util.lists import WeightedIndex
def
compute_occs
(
corpus
,
overwrite_id
=
None
,
groupings_id
=
None
,):
def
compute_occs
(
corpus
,
overwrite_id
=
None
,
groupings_id
=
None
,):
...
@@ -32,7 +34,7 @@ def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
...
@@ -32,7 +34,7 @@ def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
Parameters:
Parameters:
- overwrite_id: optional id of a pre-existing OCCURRENCES node for this corpus
- overwrite_id: optional id of a pre-existing OCCURRENCES node for this corpus
(the Node and its previous NodeNodeNgram rows will be replaced)
(the Node and its previous NodeNodeNgram rows will be replaced)
- groupings_id: optional id of a GROUPLIST node for th
is corpu
s
- groupings_id: optional id of a GROUPLIST node for th
ese ngram
s
IF absent the occurrences are the sums for each ngram
IF absent the occurrences are the sums for each ngram
IF present they're the sums for each ngram's mainform
IF present they're the sums for each ngram's mainform
"""
"""
...
@@ -61,6 +63,7 @@ def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
...
@@ -61,6 +63,7 @@ def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
# ------------
# ------------
# (the occurrences are the sums for each ngram's mainform)
# (the occurrences are the sums for each ngram's mainform)
else
:
else
:
print
(
"gtoup mode"
)
# sub-SELECT the synonyms of this GROUPLIST id (for OUTER JOIN later)
# sub-SELECT the synonyms of this GROUPLIST id (for OUTER JOIN later)
syn
=
(
session
.
query
(
NodeNgramNgram
.
ngram1_id
,
syn
=
(
session
.
query
(
NodeNgramNgram
.
ngram1_id
,
NodeNgramNgram
.
ngram2_id
)
NodeNgramNgram
.
ngram2_id
)
...
@@ -115,7 +118,8 @@ def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
...
@@ -115,7 +118,8 @@ def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
if
overwrite_id
:
if
overwrite_id
:
# overwrite pre-existing id
# overwrite pre-existing id
the_id
=
overwrite_id
the_id
=
overwrite_id
# occnode = cache.Node[overwrite_id]
session
.
query
(
NodeNodeNgram
)
.
filter
(
NodeNodeNgram
.
node1_id
==
the_id
)
.
delete
()
session
.
commit
()
else
:
else
:
# create the new OCCURRENCES node
# create the new OCCURRENCES node
occnode
=
corpus
.
add_child
(
occnode
=
corpus
.
add_child
(
...
@@ -126,8 +130,8 @@ def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
...
@@ -126,8 +130,8 @@ def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
session
.
commit
()
session
.
commit
()
the_id
=
occnode
.
id
the_id
=
occnode
.
id
#
reflect that in NodeNodeNgrams (could be NodeNgram but harmony with tfidf)
#
£TODO make it NodeNgram instead of NodeNodeNgram ! and rebase :/
#
£TODO replace bulk_insert by something like WeightedContextMatrix.save(
)
#
(idem ti_ranking
)
bulk_insert
(
bulk_insert
(
NodeNodeNgram
,
NodeNodeNgram
,
(
'node1_id'
,
'node2_id'
,
'ngram_id'
,
'score'
),
(
'node1_id'
,
'node2_id'
,
'ngram_id'
,
'score'
),
...
@@ -137,14 +141,26 @@ def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
...
@@ -137,14 +141,26 @@ def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
return
the_id
return
the_id
def
compute_ti_ranking
(
corpus
,
count_scope
=
"local"
,
termset_scope
=
"local"
,
overwrite_id
=
None
):
def
compute_ti_ranking
(
corpus
,
groupings_id
=
None
,
count_scope
=
"local"
,
termset_scope
=
"local"
,
overwrite_id
=
None
):
"""
"""
# TODO check if cumulated tfs correspond to app's use cases and intention
Calculates tfidf ranking within given scope
----------
Calculates tfidf ranking (cumulated tfidf for each ngram) within given scope
|
via weighting of
cumulated tfidf --------- Sum{i}(tf_ij) * ln(N/|U{i}(docs{word ∈ d})|)
per ngram ng_i
(or per mainform ng_i' if groups)
across some docs d_j
Parameters:
Parameters:
- the corpus itself
- the corpus itself (or corpus_id)
- groupings_id: optional id of a GROUPLIST node for these ngrams
IF absent the ti weights are the sums for each ngram
IF present they're the sums for each ngram's mainform
- count_scope: {"local" or "global"}
- count_scope: {"local" or "global"}
- local <=> frequencies counted in the current corpus
- local <=> frequencies counted in the current corpus
- global <=> frequencies counted in all corpora of this type
- global <=> frequencies counted in all corpora of this type
...
@@ -153,43 +169,94 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
...
@@ -153,43 +169,94 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
- termset_scope: {"local" or "global"}
- termset_scope: {"local" or "global"}
- local <=> output list of terms limited to the current corpus
- local <=> output list of terms limited to the current corpus
(SELECT DISTINCT ngram_id FROM nodes_ngrams WHERE node_id IN <docs>)
(SELECT DISTINCT ngram_id FROM nodes_ngrams WHERE node_id IN <docs>)
- global <=> output list of terms f
rom all corpora of this ty
pe
- global <=> output list of terms f
ound in global doc sco
pe
!!!! (many more terms)
!!!! (many more terms)
- overwrite_id: optional id of a pre-existing
TFIDF-
XXXX node for this corpus
- overwrite_id: optional id of a pre-existing XXXX node for this corpus
(the Node and its previous NodeNodeNgram rows will be replaced)
(the Node and its previous Node
NodeNgram rows will be replaced)
"""
"""
# validate string params
if
count_scope
not
in
[
"local"
,
"global"
]:
raise
ValueError
(
"compute_ti_ranking: count_scope param allowed values: 'local', 'global'"
)
if
termset_scope
not
in
[
"local"
,
"global"
]:
raise
ValueError
(
"compute_ti_ranking: termset_scope param allowed values: 'local', 'global'"
)
if
count_scope
==
"local"
and
termset_scope
==
"global"
:
raise
ValueError
(
"compute_ti_ranking: the termset_scope param can be 'global' iff count_scope param is 'global' too."
)
# get corpus
if
type
(
corpus
)
==
int
:
corpus_id
=
corpus
corpus
=
cache
.
Node
[
corpus_id
]
elif
type
(
corpus
)
==
str
and
match
(
r'\d+$'
,
corpus
):
corpus_id
=
int
(
corpus
)
corpus
=
cache
.
Node
[
corpus_id
]
else
:
# assuming Node class
corpus_id
=
corpus
.
id
# prepare sqla mainform vs ngram selector
ngform_i
=
None
if
not
groupings_id
:
ngform_i
=
NodeNgram
.
ngram_id
else
:
# prepare translations
syno
=
(
session
.
query
(
NodeNgramNgram
.
ngram1_id
,
NodeNgramNgram
.
ngram2_id
)
.
filter
(
NodeNgramNgram
.
node_id
==
groupings_id
)
.
subquery
()
)
# cf. the detailed comment in compute_occs() + TODO factorize
ngform_i
=
case
([
(
syno
.
c
.
ngram1_id
!=
None
,
syno
.
c
.
ngram1_id
),
(
syno
.
c
.
ngram1_id
==
None
,
NodeNgram
.
ngram_id
)
# condition value
])
# MAIN QUERY SKELETON
# MAIN QUERY SKELETON
tf_nd_query
=
(
session
tf_nd_query
=
(
session
.
query
(
.
query
(
NodeNgram
.
ngram_id
,
# NodeNgram.ngram_id
# or similar if grouping ngrams under their mainform
ngform_i
.
label
(
"counted_ngform"
),
# the tfidf elements
# ------------------
func
.
sum
(
NodeNgram
.
weight
),
# tf: same as occurrences
func
.
sum
(
NodeNgram
.
weight
),
# tf: same as occurrences
# -----------------------
# -----------------------
func
.
count
(
NodeNgram
.
node_id
)
# nd: n docs with term
func
.
count
(
NodeNgram
.
node_id
)
# nd: n docs with term
# --------------------
# --------------------
)
)
.
group_by
(
NodeNgram
.
ngram_id
)
.
group_by
(
"counted_ngform"
)
#
optional *count_scope*: if we'll restrict the doc nodes
#
count_scope to specify in which doc nodes to count
#
--
-----------
# -----------
# .join(countdocs_subquery,
# .join(countdocs_subquery,
# countdocs_subquery.c.id == NodeNgram.node_id)
# countdocs_subquery.c.id == NodeNgram.node_id)
# optional
*termset_scope*
: if we'll restrict the ngrams
# optional
termset_scope
: if we'll restrict the ngrams
# -------------
--
# -------------
# .join(termset_subquery,
# .join(termset_subquery,
# termset_subquery.c.uniq_ngid == NodeNgram.ngram_id)
# termset_subquery.c.uniq_ngid == NodeNgram.ngram_id)
# optional translations to bring the subform's replacement
# ------------
# .outerjoin(syno,
# syno.c.ngram2_id == NodeNgram.ngram_id)
)
)
# validate string params
if
count_scope
not
in
[
"local"
,
"global"
]:
raise
ValueError
(
"compute_ti_ranking: count_scope param allowed values: 'local', 'global'"
)
# TUNING THE QUERY
if
termset_scope
not
in
[
"local"
,
"global"
]:
raise
ValueError
(
"compute_ti_ranking: termset_scope param allowed values: 'local', 'global'"
)
if
groupings_id
:
if
count_scope
==
"local"
and
termset_scope
==
"global"
:
tf_nd_query
=
tf_nd_query
.
outerjoin
(
raise
ValueError
(
"compute_ti_ranking: the termset_scope param can be 'global' iff count_scope param is 'global' too."
)
syno
,
syno
.
c
.
ngram2_id
==
NodeNgram
.
ngram_id
)
# local <=> within this corpus
# local <=> within this corpus
if
count_scope
==
"local"
:
if
count_scope
==
"local"
:
...
@@ -197,14 +264,14 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
...
@@ -197,14 +264,14 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
countdocs_subquery
=
(
session
countdocs_subquery
=
(
session
.
query
(
Node
.
id
)
.
query
(
Node
.
id
)
.
filter
(
Node
.
typename
==
"DOCUMENT"
)
.
filter
(
Node
.
typename
==
"DOCUMENT"
)
.
filter
(
Node
.
parent_id
==
corpus
.
id
)
.
filter
(
Node
.
parent_id
==
corpus
_
id
)
.
subquery
()
.
subquery
()
)
)
#
both scopes are the same:
no need to independently restrict the ngrams
# no need to independently restrict the ngrams
tf_nd_query
=
tf_nd_query
.
join
(
countdocs_subquery
,
tf_nd_query
=
tf_nd_query
.
join
(
countdocs_subquery
,
countdocs_subquery
.
c
.
id
==
NodeNgram
.
node_id
)
countdocs_subquery
.
c
.
id
==
NodeNgram
.
node_id
)
# ---
# global <=> within all corpora of this source
# global <=> within all corpora of this source
elif
count_scope
==
"global"
:
elif
count_scope
==
"global"
:
...
@@ -220,6 +287,7 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
...
@@ -220,6 +287,7 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
# join on parent_id with selected corpora nodes
# join on parent_id with selected corpora nodes
.
join
(
CorpusNode
,
CorpusNode
.
id
==
Node
.
parent_id
)
.
join
(
CorpusNode
,
CorpusNode
.
id
==
Node
.
parent_id
)
.
filter
(
CorpusNode
.
typename
==
"CORPUS"
)
.
filter
(
CorpusNode
.
typename
==
"CORPUS"
)
# TODO index corpus_sourcetype in DB
.
filter
(
CorpusNode
.
hyperdata
[
'resources'
][
0
][
'type'
]
.
astext
==
str
(
this_source_type
))
.
filter
(
CorpusNode
.
hyperdata
[
'resources'
][
0
][
'type'
]
.
astext
==
str
(
this_source_type
))
.
subquery
()
.
subquery
()
)
)
...
@@ -228,15 +296,19 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
...
@@ -228,15 +296,19 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
# both scopes are the same: no need to independently restrict the ngrams
# both scopes are the same: no need to independently restrict the ngrams
tf_nd_query
=
tf_nd_query
.
join
(
countdocs_subquery
,
tf_nd_query
=
tf_nd_query
.
join
(
countdocs_subquery
,
countdocs_subquery
.
c
.
id
==
NodeNgram
.
node_id
)
countdocs_subquery
.
c
.
id
==
NodeNgram
.
node_id
)
# ---
elif
termset_scope
==
"local"
:
elif
termset_scope
==
"local"
:
# All unique terms
in the original corpus
# All unique terms
...
termset_subquery
=
(
session
termset_subquery
=
(
session
.
query
(
distinct
(
NodeNgram
.
ngram_id
)
.
label
(
"uniq_ngid"
))
.
query
(
distinct
(
NodeNgram
.
ngram_id
)
.
label
(
"uniq_ngid"
)
)
# ... in the original corpus
.
join
(
Node
)
.
join
(
Node
)
.
filter
(
Node
.
typename
==
"DOCUMENT"
)
.
filter
(
Node
.
typename
==
"DOCUMENT"
)
.
filter
(
Node
.
parent_id
==
corpus
.
id
)
.
filter
(
Node
.
parent_id
==
corpus
_
id
)
.
subquery
()
.
subquery
()
)
)
...
@@ -247,42 +319,59 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
...
@@ -247,42 +319,59 @@ def compute_ti_ranking(corpus, count_scope="local", termset_scope="local", overw
.
join
(
termset_subquery
,
.
join
(
termset_subquery
,
termset_subquery
.
c
.
uniq_ngid
==
NodeNgram
.
ngram_id
)
termset_subquery
.
c
.
uniq_ngid
==
NodeNgram
.
ngram_id
)
)
)
# ---
#
N
#
M
total_docs
=
session
.
query
(
countdocs_subquery
)
.
count
()
total_docs
=
session
.
query
(
countdocs_subquery
)
.
count
()
log_tot_docs
=
log
(
total_docs
)
# result
# result
tf_nd
=
tf_nd_query
.
all
()
tf_nd
=
tf_nd_query
.
all
()
# -------------------------------------------------
# -------------- "summation" over word i ----------------
tfidfs
=
{}
tfidfsum
=
{}
log_tot_docs
=
log
(
total_docs
)
for
(
ngram_i
,
tf_i
,
nd_i
)
in
tf_nd
:
for
(
ngram_id
,
tf
,
nd
)
in
tf_nd
:
# tfidfsum[ngram_i] = tf_i * log(total_docs/nd_i)
# tfidfs[ngram_id] = tf * log(total_docs/nd)
tfidfsum
[
ngram_i
]
=
tf_i
*
(
log_tot_docs
-
log
(
nd_i
))
tfidfs
[
ngram_id
]
=
tf
*
(
log_tot_docs
-
log
(
nd
))
# ------------------------------------------------------
# -------------------------------------------------
# N for info
total_ngramforms
=
len
(
tfidfsum
)
if
overwrite_id
:
if
overwrite_id
:
the_id
=
overwrite_id
the_id
=
overwrite_id
session
.
query
(
NodeNodeNgram
)
.
filter
(
NodeNodeNgram
.
node1_id
==
the_id
)
.
delete
()
session
.
commit
()
else
:
else
:
# create the new TFIDF-XXXX node
# create the new TFIDF-XXXX node to get an id
tfidf_nd
=
corpus
.
add_child
()
tir_nd
=
corpus
.
add_child
()
if
count_scope
==
"local"
:
# TODO discuss use and find new typename
if
count_scope
==
"local"
:
tfidf_nd
.
typename
=
"TFIDF-CORPUS"
tir_nd
.
typename
=
"TFIDF-CORPUS"
tfidf_nd
.
name
=
"tfidf-cumul-corpus (in:
%
s)"
%
corpus
.
id
tir_nd
.
name
=
"ti rank (
%
i ngforms in corpus:
%
s)"
%
(
total_ngramforms
,
corpus_id
)
elif
count_scope
==
"global"
:
elif
count_scope
==
"global"
:
tfidf_nd
.
typename
=
"TFIDF-GLOBAL"
tir_nd
.
typename
=
"TFIDF-GLOBAL"
tfidf_nd
.
name
=
"tfidf-cumul-global (in type:
%
s)"
%
this_source_type
tir_nd
.
name
=
"ti rank (
%
i ngforms
%
s in corpora of sourcetype:
%
s)"
%
(
session
.
add
(
tfidf_nd
)
total_ngramforms
,
(
"from corpus
%
i"
%
corpus_id
)
if
(
termset_scope
==
"local"
)
else
""
,
this_source_type
)
session
.
add
(
tir_nd
)
session
.
commit
()
session
.
commit
()
the_id
=
tfidf_nd
.
id
the_id
=
tir_nd
.
id
# TODO 1 discuss use and find new typename
# TODO 2 release these 2 typenames TFIDF-CORPUS and TFIDF-GLOBAL
# TODO 3 recreate them elsewhere in their sims (WeightedIndex) version
# TODO 4 requalify this here as a NodeNgram
# then TODO 5 use WeightedList.save() !
# reflect that in NodeNodeNgrams
# reflect that in NodeNodeNgrams
# £TODO replace bulk_insert by something like WeightedContextMatrix.save()
bulk_insert
(
bulk_insert
(
NodeNodeNgram
,
NodeNodeNgram
,
(
'node1_id'
,
'node2_id'
,
'ngram_id'
,
'score'
),
(
'node1_id'
,
'node2_id'
,
'ngram_id'
,
'score'
),
((
the_id
,
corpus
.
id
,
ng
,
tfidfs
[
ng
])
for
ng
in
tfidfs
)
((
the_id
,
corpus_id
,
ng
,
tfidfsum
[
ng
])
for
ng
in
tfidfsum
)
)
)
return
the_id
return
the_id
...
@@ -347,6 +436,8 @@ def compute_tfidf_local(corpus, overwrite_id=None):
...
@@ -347,6 +436,8 @@ def compute_tfidf_local(corpus, overwrite_id=None):
if
overwrite_id
:
if
overwrite_id
:
the_id
=
overwrite_id
the_id
=
overwrite_id
session
.
query
(
NodeNodeNgram
)
.
filter
(
NodeNodeNgram
.
node1_id
==
the_id
)
.
delete
()
session
.
commit
()
else
:
else
:
# create the new TFIDF-CORPUS node
# create the new TFIDF-CORPUS node
tfidf_node
=
corpus
.
add_child
()
tfidf_node
=
corpus
.
add_child
()
...
@@ -357,7 +448,7 @@ def compute_tfidf_local(corpus, overwrite_id=None):
...
@@ -357,7 +448,7 @@ def compute_tfidf_local(corpus, overwrite_id=None):
the_id
=
tfidf_node
.
id
the_id
=
tfidf_node
.
id
# reflect that in NodeNodeNgrams
# reflect that in NodeNodeNgrams
# £TODO replace bulk_insert by something like Weighted
ContextMatri
x.save()
# £TODO replace bulk_insert by something like Weighted
Inde
x.save()
bulk_insert
(
bulk_insert
(
NodeNodeNgram
,
NodeNodeNgram
,
(
'node1_id'
,
'node2_id'
,
'ngram_id'
,
'score'
),
(
'node1_id'
,
'node2_id'
,
'ngram_id'
,
'score'
),
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment