Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
7cea952c
Commit
7cea952c
authored
May 22, 2016
by
Romain Loth
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
FIX unnecessary writing of big cooc matrix used once just in specif + update maplist
parent
51bc0bf5
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
43 additions
and
30 deletions
+43
-30
__init__.py
gargantext/util/toolchain/__init__.py
+11
-9
list_map.py
gargantext/util/toolchain/list_map.py
+14
-11
metric_specificity.py
gargantext/util/toolchain/metric_specificity.py
+18
-10
No files found.
gargantext/util/toolchain/__init__.py
View file @
7cea952c
...
...
@@ -136,21 +136,23 @@ def parse_extract_indexhyperdata(corpus):
# ------------
# -> cooccurrences on mainlist: compute + write (=> Node and NodeNgramNgram)
# todo: no need to write it ?
cooc_id
=
compute_coocs
(
corpus
,
on_list_id
=
mainlist_id
,
groupings_id
=
group_id
)
print
(
'CORPUS #
%
d: [
%
s] new coocs node #
%
i'
%
(
corpus
.
id
,
t
(),
cooc_id
))
coocs
=
compute_coocs
(
corpus
,
on_list_id
=
mainlist_id
,
groupings_id
=
group_id
,
just_pass_result
=
True
)
print
(
'CORPUS #
%
d: [
%
s] computed mainlist coocs for specif rank'
%
(
corpus
.
id
,
t
()))
# -> specificity: compute + write (=> NodeNodeNgram)
spec_id
=
compute_specificity
(
corpus
,
cooc_id
=
cooc_id
# ,groupings_id = group_id
)
spec_id
=
compute_specificity
(
corpus
,
cooc_matrix
=
coocs
)
# no need here for subforms because cooc already counted them in mainform
print
(
'CORPUS #
%
d: [
%
s] new specificity node #
%
i'
%
(
corpus
.
id
,
t
(),
spec_id
))
#
??
maplist: compute + write (to Node and NodeNgram)
# maplist: compute + write (to Node and NodeNgram)
map_id
=
do_maplist
(
corpus
,
mainlist_id
=
mainlist_id
,
specificity_id
=
spec_id
,
grouplist_id
=
group_id
)
grouplist_id
=
group_id
)
print
(
'CORPUS #
%
d: [
%
s] new maplist node #
%
i'
%
(
corpus
.
id
,
t
(),
map_id
))
print
(
'CORPUS #
%
d: [
%
s] FINISHED ngram lists computation'
%
(
corpus
.
id
,
t
()))
...
...
@@ -161,7 +163,7 @@ def parse_extract_indexhyperdata(corpus):
if
DEBUG
is
False
:
print
(
'CORPUS #
%
d: [
%
s] FINISHED Sendin
d
email notification'
%
(
corpus
.
id
,
t
()))
print
(
'CORPUS #
%
d: [
%
s] FINISHED Sendin
g
email notification'
%
(
corpus
.
id
,
t
()))
notify_owner
(
corpus
)
corpus
.
status
(
'Workflow'
,
progress
=
10
,
complete
=
True
)
...
...
gargantext/util/toolchain/list_map.py
View file @
7cea952c
...
...
@@ -43,15 +43,11 @@ def do_maplist(corpus,
#dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)
mainterms_subquery
=
(
session
# we want only terms within mainlist
.
query
(
NodeNgram
.
ngram_id
)
.
filter
(
NodeNgram
.
node_id
==
mainlist_id
)
.
subquery
()
)
MainlistTable
=
aliased
(
NodeNgram
)
primary_groupterms_subquery
=
(
session
# we want only primary terms (ngram1)
IsSubform
=
(
session
# we want only secondary terms (ngram2)
# to be able to filter them out
.
query
(
NodeNgramNgram
.
ngram2_id
)
.
filter
(
NodeNgramNgram
.
node_id
==
grouplist_id
)
.
subquery
()
...
...
@@ -63,8 +59,15 @@ def do_maplist(corpus,
query
=
(
session
.
query
(
ScoreSpec
.
ngram_id
)
.
join
(
Ngram
,
Ngram
.
id
==
ScoreSpec
.
ngram_id
)
.
filter
(
ScoreSpec
.
node_id
==
specificity_id
)
.
filter
(
ScoreSpec
.
ngram_id
.
in_
(
mainterms_subquery
))
.
filter
(
ScoreSpec
.
ngram_id
.
notin_
(
primary_groupterms_subquery
))
# we want only terms within mainlist
.
join
(
MainlistTable
,
Ngram
.
id
==
MainlistTable
.
ngram_id
)
.
filter
(
MainlistTable
.
node_id
==
mainlist_id
)
# we remove all ngrams matching an ngram2_id from the synonyms
.
outerjoin
(
IsSubform
,
IsSubform
.
c
.
ngram2_id
==
ScoreSpec
.
ngram_id
)
.
filter
(
IsSubform
.
c
.
ngram2_id
==
None
)
)
# TODO: move these 2 pools up to mainlist selection
...
...
@@ -94,7 +97,7 @@ def do_maplist(corpus,
new_hyperdata
=
{
'corpus'
:
corpus
.
id
,
'limit'
:
limit
,
'monograms_part'
:
monograms_part
,
'monograms_result'
:
obtained_mono
/
obtained_total
if
obtained_total
!=
0
else
obtained_mono
'monograms_result'
:
obtained_mono
/
obtained_total
if
obtained_total
!=
0
else
0
}
if
overwrite_id
:
# overwrite pre-existing node
...
...
gargantext/util/toolchain/metric_specificity.py
View file @
7cea952c
...
...
@@ -9,7 +9,7 @@ from collections import defaultdict
from
pandas
import
DataFrame
import
pandas
as
pd
def
compute_specificity
(
corpus
,
cooc_id
=
None
,
overwrite_id
=
None
):
def
compute_specificity
(
corpus
,
cooc_id
=
None
,
cooc_matrix
=
None
,
overwrite_id
=
None
):
'''
Compute the specificity, simple calculus.
...
...
@@ -18,18 +18,26 @@ def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
- overwrite_id: optional preexisting specificity node to overwrite
'''
matrix
=
defaultdict
(
lambda
:
defaultdict
(
float
))
if
cooc_id
==
None
and
cooc_matrix
==
None
:
raise
TypeError
(
"compute_specificity: needs a cooc_id or cooc_matrix param"
)
elif
cooc_id
:
cooccurrences
=
(
session
.
query
(
NodeNgramNgram
)
.
filter
(
NodeNgramNgram
.
node_id
==
cooc_id
)
)
# no filtering: new choice cooc already filtered on tfidf before creation
matrix
=
defaultdict
(
lambda
:
defaultdict
(
float
))
# £TODO re-rename weight => score
# no filtering: cooc already filtered on mainlist_id at creation
for
cooccurrence
in
cooccurrences
:
matrix
[
cooccurrence
.
ngram1_id
][
cooccurrence
.
ngram2_id
]
=
cooccurrence
.
weight
matrix
[
cooccurrence
.
ngram2_id
][
cooccurrence
.
ngram1_id
]
=
cooccurrence
.
weight
elif
cooc_matrix
:
# copy WeightedMatrix into local matrix structure
for
(
ngram1_id
,
ngram2_id
)
in
cooc_matrix
.
items
:
w
=
cooc_matrix
.
items
[(
ngram1_id
,
ngram2_id
)]
matrix
[
ngram1_id
][
ngram2_id
]
=
w
nb_ngrams
=
len
(
matrix
)
print
(
"SPECIFICITY: computing on
%
i ngrams"
%
nb_ngrams
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment