Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
20e969ed
Commit
20e969ed
authored
Mar 30, 2016
by
delanoe
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'refactoring-rom' into refactoring-alex
parents
9ad61799
6341dc12
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
93 additions
and
10 deletions
+93
-10
__init__.py
gargantext/util/toolchain/__init__.py
+5
-5
metric_tfidf.py
gargantext/util/toolchain/metric_tfidf.py
+88
-5
No files found.
gargantext/util/toolchain/__init__.py
View file @
20e969ed
...
...
@@ -4,7 +4,7 @@ from .hyperdata_indexing import index_hyperdata
# in usual run order
from
.list_stop
import
do_stoplist
from
.metric_tfidf
import
compute_occs
,
compute_tfidf
from
.metric_tfidf
import
compute_occs
,
compute_tfidf
_local
,
compute_cumulated_tfidf
from
.list_main
import
do_mainlist
from
.ngram_coocs
import
compute_coocs
from
.metric_specificity
import
compute_specificity
...
...
@@ -75,12 +75,12 @@ def parse_extract_indexhyperdata(corpus):
print
(
'CORPUS #
%
d: [
%
s] new occs node #
%
i'
%
(
corpus
.
id
,
t
(),
occ_id
))
# ------------
# -> write local tfidf to Node and NodeNodeNgram
ltfidf_id
=
compute_tfidf
(
corpus
,
scope
=
"local"
)
# -> write local tfidf
similarities
to Node and NodeNodeNgram
ltfidf_id
=
compute_tfidf
_local
(
corpus
)
print
(
'CORPUS #
%
d: [
%
s] new localtfidf node #
%
i'
%
(
corpus
.
id
,
t
(),
ltfidf_id
))
# -> write global tfidf to Node and NodeNodeNgram
gtfidf_id
=
compute_tfidf
(
corpus
,
scope
=
"global"
)
# -> write global
and cumulated
tfidf to Node and NodeNodeNgram
gtfidf_id
=
compute_
cumulated_
tfidf
(
corpus
,
scope
=
"global"
)
print
(
'CORPUS #
%
d: [
%
s] new globaltfidf node #
%
i'
%
(
corpus
.
id
,
t
(),
gtfidf_id
))
# -> mainlist: filter + write (to Node and NodeNgram)
...
...
gargantext/util/toolchain/metric_tfidf.py
View file @
20e969ed
...
...
@@ -18,6 +18,8 @@ from math import log
def
compute_occs
(
corpus
,
overwrite_id
=
None
):
"""
# TODO check if cumulated occs correspond to app's use cases and intention
Calculates sum of occs per ngram within corpus
(used as info in the ngrams table view)
...
...
@@ -78,9 +80,11 @@ def compute_occs(corpus, overwrite_id = None):
return
the_id
def
compute_tfidf
(
corpus
,
scope
=
"local"
,
overwrite_id
=
None
):
def
compute_
cumulated_
tfidf
(
corpus
,
scope
=
"local"
,
overwrite_id
=
None
):
"""
Calculates tfidf within the current corpus
# TODO check if cumulated tfs correspond to app's use cases and intention
Calculates tfidf ranking (cumulated tfidf) within the given scope
Parameters:
- the corpus itself
...
...
@@ -150,12 +154,12 @@ def compute_tfidf(corpus, scope="local", overwrite_id=None):
else
:
# create the new TFIDF-XXXX node
tfidf_nd
=
corpus
.
add_child
()
if
scope
==
"local"
:
if
scope
==
"local"
:
# TODO discuss use and find new typename
tfidf_nd
.
typename
=
"TFIDF-CORPUS"
tfidf_nd
.
name
=
"tfidf-c (in:
%
s)"
%
corpus
.
id
tfidf_nd
.
name
=
"tfidf-c
umul-corpus
(in:
%
s)"
%
corpus
.
id
elif
scope
==
"global"
:
tfidf_nd
.
typename
=
"TFIDF-GLOBAL"
tfidf_nd
.
name
=
"tfidf-
g
(in type:
%
s)"
%
this_source_type
tfidf_nd
.
name
=
"tfidf-
cumul-global
(in type:
%
s)"
%
this_source_type
session
.
add
(
tfidf_nd
)
session
.
commit
()
the_id
=
tfidf_nd
.
id
...
...
@@ -169,3 +173,82 @@ def compute_tfidf(corpus, scope="local", overwrite_id=None):
)
return
the_id
def compute_tfidf_local(corpus, overwrite_id=None):
    """
    Calculates tfidf similarity of each (doc, ngram) couple, within the current corpus

    The score written for a couple is tf * (log(N) - log(nd)), i.e. tf * log(N/nd):
        - tf: sum of the NodeNgram weights of the ngram in the doc
        - nd: count of NodeNgram rows of the ngram among the corpus docs
        - N : number of DOCUMENT children of the corpus

    Parameters:
        - the corpus itself
        - overwrite_id: optional id of a pre-existing TFIDF-XXXX node for this corpus
                     (the node is reused, its previous NodeNodeNgram rows
                      are deleted and replaced by the new scores)

    Returns:
        the id of the node the scores are attached to
        (overwrite_id if provided, otherwise a newly created TFIDF-CORPUS node)

    Raises:
        ValueError (from math.log) if the corpus has no DOCUMENT child
    """

    # All docs of this corpus
    docids_subquery = (
        session
        .query(Node.id)
        .filter(Node.parent_id == corpus.id)
        .filter(Node.typename == "DOCUMENT")
        .subquery()
    )

    # N
    total_docs = session.query(docids_subquery).count()

    # number of docs with given term (number of rows = M ngrams)
    # NB: this counts NodeNgram rows per ngram, which equals the number of docs
    #     only if there is one row per (doc, ngram) couple -- TODO confirm
    n_docswith_ng = (
        session
        .query(
            NodeNgram.ngram_id,
            func.count(NodeNgram.node_id).label("nd"),   # nd: n docs with term
        )
        .filter(NodeNgram.node_id.in_(docids_subquery))
        .group_by(NodeNgram.ngram_id)
        .all()
    )

    # { ngram_id => log(nd) }
    log_nd_lookup = {row.ngram_id: log(row.nd) for row in n_docswith_ng}

    # tf for each couple (number of rows = N docs X M ngrams)
    tf_doc_ng = (
        session
        .query(
            NodeNgram.ngram_id,
            NodeNgram.node_id,
            func.sum(NodeNgram.weight).label("tf"),      # tf: occurrences
        )
        .filter(NodeNgram.node_id.in_(docids_subquery))
        .group_by(NodeNgram.node_id, NodeNgram.ngram_id)
        .all()
    )

    # ---------------------------------------------------------
    # { (node_id, ngram_id) => tfidf }
    tfidfs = {}
    log_tot_docs = log(total_docs)
    for (ngram_id, node_id, tf) in tf_doc_ng:
        log_nd = log_nd_lookup[ngram_id]
        # tfidfs[node_id, ngram_id] = tf * log(total_docs/nd)
        tfidfs[node_id, ngram_id] = tf * (log_tot_docs - log_nd)
    # ---------------------------------------------------------

    if overwrite_id:
        the_id = overwrite_id
        # remove the previous scores of this node, as promised by the docstring:
        # otherwise the rows inserted below pile up on (or collide with) old ones
        # (no need to synchronize the session: the commit expires it anyway)
        (
            session
            .query(NodeNodeNgram)
            .filter(NodeNodeNgram.node1_id == the_id)
            .delete(synchronize_session=False)
        )
        # committed before bulk_insert, like the node creation in the other branch
        session.commit()
    else:
        # create the new TFIDF-CORPUS node
        tfidf_node = corpus.add_child()
        tfidf_node.typename = "TFIDF-CORPUS"
        tfidf_node.name = "tfidf-sims-corpus (in:%s)" % corpus.id
        session.add(tfidf_node)
        session.commit()
        the_id = tfidf_node.id

    # reflect that in NodeNodeNgrams
    # £TODO replace bulk_insert by something like WeightedContextMatrix.save()
    bulk_insert(
        NodeNodeNgram,
        ('node1_id', 'node2_id', 'ngram_id', 'score'),
        (
            (the_id, node_id, ngram_id, score)
            for (node_id, ngram_id), score in tfidfs.items()
        )
    )

    return the_id
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment