Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
7cea952c
Commit
7cea952c
authored
May 22, 2016
by
Romain Loth
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
FIX unnecessary writing of big cooc matrix used once just in specif + update maplist
parent
51bc0bf5
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
43 additions
and
30 deletions
+43
-30
__init__.py
gargantext/util/toolchain/__init__.py
+11
-9
list_map.py
gargantext/util/toolchain/list_map.py
+14
-11
metric_specificity.py
gargantext/util/toolchain/metric_specificity.py
+18
-10
No files found.
gargantext/util/toolchain/__init__.py
View file @
7cea952c
...
@@ -136,21 +136,23 @@ def parse_extract_indexhyperdata(corpus):
...
@@ -136,21 +136,23 @@ def parse_extract_indexhyperdata(corpus):
# ------------
# ------------
# -> cooccurrences on mainlist: compute + write (=> Node and NodeNgramNgram)
# -> cooccurrences on mainlist: compute + write (=> Node and NodeNgramNgram)
# todo: no need to write it ?
coocs
=
compute_coocs
(
corpus
,
cooc_id
=
compute_coocs
(
corpus
,
on_list_id
=
mainlist_id
,
groupings_id
=
group_id
)
on_list_id
=
mainlist_id
,
print
(
'CORPUS #
%
d: [
%
s] new coocs node #
%
i'
%
(
corpus
.
id
,
t
(),
cooc_id
))
groupings_id
=
group_id
,
just_pass_result
=
True
)
print
(
'CORPUS #
%
d: [
%
s] computed mainlist coocs for specif rank'
%
(
corpus
.
id
,
t
()))
# -> specificity: compute + write (=> NodeNodeNgram)
# -> specificity: compute + write (=> NodeNodeNgram)
spec_id
=
compute_specificity
(
corpus
,
cooc_id
=
cooc_id
spec_id
=
compute_specificity
(
corpus
,
cooc_matrix
=
coocs
)
# ,groupings_id = group_id
# no need here for subforms because cooc already counted them in mainform
)
print
(
'CORPUS #
%
d: [
%
s] new specificity node #
%
i'
%
(
corpus
.
id
,
t
(),
spec_id
))
print
(
'CORPUS #
%
d: [
%
s] new specificity node #
%
i'
%
(
corpus
.
id
,
t
(),
spec_id
))
#
??
maplist: compute + write (to Node and NodeNgram)
# maplist: compute + write (to Node and NodeNgram)
map_id
=
do_maplist
(
corpus
,
map_id
=
do_maplist
(
corpus
,
mainlist_id
=
mainlist_id
,
mainlist_id
=
mainlist_id
,
specificity_id
=
spec_id
,
specificity_id
=
spec_id
,
grouplist_id
=
group_id
)
grouplist_id
=
group_id
)
print
(
'CORPUS #
%
d: [
%
s] new maplist node #
%
i'
%
(
corpus
.
id
,
t
(),
map_id
))
print
(
'CORPUS #
%
d: [
%
s] new maplist node #
%
i'
%
(
corpus
.
id
,
t
(),
map_id
))
print
(
'CORPUS #
%
d: [
%
s] FINISHED ngram lists computation'
%
(
corpus
.
id
,
t
()))
print
(
'CORPUS #
%
d: [
%
s] FINISHED ngram lists computation'
%
(
corpus
.
id
,
t
()))
...
@@ -161,7 +163,7 @@ def parse_extract_indexhyperdata(corpus):
...
@@ -161,7 +163,7 @@ def parse_extract_indexhyperdata(corpus):
if
DEBUG
is
False
:
if
DEBUG
is
False
:
print
(
'CORPUS #
%
d: [
%
s] FINISHED Sendin
d
email notification'
%
(
corpus
.
id
,
t
()))
print
(
'CORPUS #
%
d: [
%
s] FINISHED Sendin
g
email notification'
%
(
corpus
.
id
,
t
()))
notify_owner
(
corpus
)
notify_owner
(
corpus
)
corpus
.
status
(
'Workflow'
,
progress
=
10
,
complete
=
True
)
corpus
.
status
(
'Workflow'
,
progress
=
10
,
complete
=
True
)
...
...
gargantext/util/toolchain/list_map.py
View file @
7cea952c
...
@@ -43,15 +43,11 @@ def do_maplist(corpus,
...
@@ -43,15 +43,11 @@ def do_maplist(corpus,
#dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)
#dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)
mainterms_subquery
=
(
session
MainlistTable
=
aliased
(
NodeNgram
)
# we want only terms within mainlist
.
query
(
NodeNgram
.
ngram_id
)
.
filter
(
NodeNgram
.
node_id
==
mainlist_id
)
.
subquery
()
)
primary_groupterms_subquery
=
(
session
IsSubform
=
(
session
# we want only primary terms (ngram1)
# we want only secondary terms (ngram2)
# to be able to filter them out
.
query
(
NodeNgramNgram
.
ngram2_id
)
.
query
(
NodeNgramNgram
.
ngram2_id
)
.
filter
(
NodeNgramNgram
.
node_id
==
grouplist_id
)
.
filter
(
NodeNgramNgram
.
node_id
==
grouplist_id
)
.
subquery
()
.
subquery
()
...
@@ -63,8 +59,15 @@ def do_maplist(corpus,
...
@@ -63,8 +59,15 @@ def do_maplist(corpus,
query
=
(
session
.
query
(
ScoreSpec
.
ngram_id
)
query
=
(
session
.
query
(
ScoreSpec
.
ngram_id
)
.
join
(
Ngram
,
Ngram
.
id
==
ScoreSpec
.
ngram_id
)
.
join
(
Ngram
,
Ngram
.
id
==
ScoreSpec
.
ngram_id
)
.
filter
(
ScoreSpec
.
node_id
==
specificity_id
)
.
filter
(
ScoreSpec
.
node_id
==
specificity_id
)
.
filter
(
ScoreSpec
.
ngram_id
.
in_
(
mainterms_subquery
))
.
filter
(
ScoreSpec
.
ngram_id
.
notin_
(
primary_groupterms_subquery
))
# we want only terms within mainlist
.
join
(
MainlistTable
,
Ngram
.
id
==
MainlistTable
.
ngram_id
)
.
filter
(
MainlistTable
.
node_id
==
mainlist_id
)
# we remove all ngrams matching an ngram2_id from the synonyms
.
outerjoin
(
IsSubform
,
IsSubform
.
c
.
ngram2_id
==
ScoreSpec
.
ngram_id
)
.
filter
(
IsSubform
.
c
.
ngram2_id
==
None
)
)
)
# TODO: move these 2 pools up to mainlist selection
# TODO: move these 2 pools up to mainlist selection
...
@@ -94,7 +97,7 @@ def do_maplist(corpus,
...
@@ -94,7 +97,7 @@ def do_maplist(corpus,
new_hyperdata
=
{
'corpus'
:
corpus
.
id
,
new_hyperdata
=
{
'corpus'
:
corpus
.
id
,
'limit'
:
limit
,
'limit'
:
limit
,
'monograms_part'
:
monograms_part
,
'monograms_part'
:
monograms_part
,
'monograms_result'
:
obtained_mono
/
obtained_total
if
obtained_total
!=
0
else
obtained_mono
'monograms_result'
:
obtained_mono
/
obtained_total
if
obtained_total
!=
0
else
0
}
}
if
overwrite_id
:
if
overwrite_id
:
# overwrite pre-existing node
# overwrite pre-existing node
...
...
gargantext/util/toolchain/metric_specificity.py
View file @
7cea952c
...
@@ -9,7 +9,7 @@ from collections import defaultdict
...
@@ -9,7 +9,7 @@ from collections import defaultdict
from
pandas
import
DataFrame
from
pandas
import
DataFrame
import
pandas
as
pd
import
pandas
as
pd
def
compute_specificity
(
corpus
,
cooc_id
=
None
,
overwrite_id
=
None
):
def
compute_specificity
(
corpus
,
cooc_id
=
None
,
cooc_matrix
=
None
,
overwrite_id
=
None
):
'''
'''
Compute the specificity, simple calculus.
Compute the specificity, simple calculus.
...
@@ -18,17 +18,25 @@ def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
...
@@ -18,17 +18,25 @@ def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
- overwrite_id: optional preexisting specificity node to overwrite
- overwrite_id: optional preexisting specificity node to overwrite
'''
'''
cooccurrences
=
(
session
.
query
(
NodeNgramNgram
)
.
filter
(
NodeNgramNgram
.
node_id
==
cooc_id
)
)
# no filtering: new choice cooc already filtered on tfidf before creation
matrix
=
defaultdict
(
lambda
:
defaultdict
(
float
))
matrix
=
defaultdict
(
lambda
:
defaultdict
(
float
))
# £TODO re-rename weight => score
if
cooc_id
==
None
and
cooc_matrix
==
None
:
for
cooccurrence
in
cooccurrences
:
raise
TypeError
(
"compute_specificity: needs a cooc_id or cooc_matrix param"
)
matrix
[
cooccurrence
.
ngram1_id
][
cooccurrence
.
ngram2_id
]
=
cooccurrence
.
weight
matrix
[
cooccurrence
.
ngram2_id
][
cooccurrence
.
ngram1_id
]
=
cooccurrence
.
weight
elif
cooc_id
:
cooccurrences
=
(
session
.
query
(
NodeNgramNgram
)
.
filter
(
NodeNgramNgram
.
node_id
==
cooc_id
)
)
# no filtering: cooc already filtered on mainlist_id at creation
for
cooccurrence
in
cooccurrences
:
matrix
[
cooccurrence
.
ngram1_id
][
cooccurrence
.
ngram2_id
]
=
cooccurrence
.
weight
matrix
[
cooccurrence
.
ngram2_id
][
cooccurrence
.
ngram1_id
]
=
cooccurrence
.
weight
elif
cooc_matrix
:
# copy WeightedMatrix into local matrix structure
for
(
ngram1_id
,
ngram2_id
)
in
cooc_matrix
.
items
:
w
=
cooc_matrix
.
items
[(
ngram1_id
,
ngram2_id
)]
matrix
[
ngram1_id
][
ngram2_id
]
=
w
nb_ngrams
=
len
(
matrix
)
nb_ngrams
=
len
(
matrix
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment