Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
51bc0bf5
Commit
51bc0bf5
authored
May 22, 2016
by
Romain Loth
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add option to not write coocs but just pass the matrix result
parent
e52afd97
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
46 additions
and
37 deletions
+46
-37
__init__.py
gargantext/util/toolchain/__init__.py
+4
-3
ngram_coocs.py
gargantext/util/toolchain/ngram_coocs.py
+42
-34
No files found.
gargantext/util/toolchain/__init__.py
View file @
51bc0bf5
...
@@ -6,12 +6,12 @@ from .hyperdata_indexing import index_hyperdata
...
@@ -6,12 +6,12 @@ from .hyperdata_indexing import index_hyperdata
# in usual run order
# in usual run order
from
.list_stop
import
do_stoplist
from
.list_stop
import
do_stoplist
from
.ngram_groups
import
compute_groups
from
.metric_tfidf
import
compute_occs
,
compute_tfidf_local
,
compute_ti_ranking
from
.metric_tfidf
import
compute_occs
,
compute_tfidf_local
,
compute_ti_ranking
from
.list_main
import
do_mainlist
from
.list_main
import
do_mainlist
from
.ngram_coocs
import
compute_coocs
from
.ngram_coocs
import
compute_coocs
from
.metric_specificity
import
compute_specificity
from
.metric_specificity
import
compute_specificity
from
.list_map
import
do_maplist
# TEST
from
.list_map
import
do_maplist
# TEST
from
.ngram_groups
import
compute_groups
from
.mail_notification
import
notify_owner
from
.mail_notification
import
notify_owner
from
gargantext.util.db
import
session
from
gargantext.util.db
import
session
from
gargantext.models
import
Node
from
gargantext.models
import
Node
...
@@ -135,8 +135,9 @@ def parse_extract_indexhyperdata(corpus):
...
@@ -135,8 +135,9 @@ def parse_extract_indexhyperdata(corpus):
# => used for doc <=> ngram association
# => used for doc <=> ngram association
# ------------
# ------------
# -> cooccurrences: compute + write (=> Node and NodeNodeNgram)
# -> cooccurrences on mainlist: compute + write (=> Node and NodeNgramNgram)
cooc_id
=
compute_coocs
(
corpus
,
mainlist_id
=
mainlist_id
,
groupings_id
=
group_id
)
# todo: no need to write it ?
cooc_id
=
compute_coocs
(
corpus
,
on_list_id
=
mainlist_id
,
groupings_id
=
group_id
)
print
(
'CORPUS #
%
d: [
%
s] new coocs node #
%
i'
%
(
corpus
.
id
,
t
(),
cooc_id
))
print
(
'CORPUS #
%
d: [
%
s] new coocs node #
%
i'
%
(
corpus
.
id
,
t
(),
cooc_id
))
# -> specificity: compute + write (=> NodeNodeNgram)
# -> specificity: compute + write (=> NodeNodeNgram)
...
...
gargantext/util/toolchain/ngram_coocs.py
View file @
51bc0bf5
...
@@ -10,9 +10,11 @@ from sqlalchemy.sql.expression import case # for choice if ngram has mainform or
...
@@ -10,9 +10,11 @@ from sqlalchemy.sql.expression import case # for choice if ngram has mainform or
def
compute_coocs
(
corpus
,
def
compute_coocs
(
corpus
,
overwrite_id
=
None
,
overwrite_id
=
None
,
just_pass_result
=
True
,
# just return the WeightedMatrix,
# (don't write to DB)
threshold
=
DEFAULT_COOC_THRESHOLD
,
threshold
=
DEFAULT_COOC_THRESHOLD
,
groupings_id
=
None
,
groupings_id
=
None
,
mainlist_id
=
None
,
on_list_id
=
None
,
stoplist_id
=
None
,
stoplist_id
=
None
,
start
=
None
,
start
=
None
,
end
=
None
,
end
=
None
,
...
@@ -46,7 +48,7 @@ def compute_coocs( corpus,
...
@@ -46,7 +48,7 @@ def compute_coocs( corpus,
- threshold: on output cooc count (previously called hapax)
- threshold: on output cooc count (previously called hapax)
- groupings_id: optional synonym relations to add all subform counts
- groupings_id: optional synonym relations to add all subform counts
with their mainform's counts
with their mainform's counts
-
mainlist_id: mainlist
to constrain the input ngrams
-
on_list_id: mainlist or maplist type,
to constrain the input ngrams
- stoplist_id: stoplist for filtering input ngrams
- stoplist_id: stoplist for filtering input ngrams
(normally unnecessary if a mainlist is already provided)
(normally unnecessary if a mainlist is already provided)
- start, end: provide one or both temporal limits to filter on doc date
- start, end: provide one or both temporal limits to filter on doc date
...
@@ -62,9 +64,10 @@ def compute_coocs( corpus,
...
@@ -62,9 +64,10 @@ def compute_coocs( corpus,
======================
======================
each pair of ngrams sharing same doc (node_id)
each pair of ngrams sharing same doc (node_id)
SELECT idxa.ngram_id, idxb.ngram_id
SELECT idxa.ngram_id, idxb.ngram_id
FROM nodes_ngrams AS idxa
, nodes_ngrams AS idxb
FROM nodes_ngrams AS idxa
---------------------------------
---------------------------------
WHERE idxa.node_id = idxb.node_id <== that's cooc
JOIN nodes_ngrams AS idxb
ON idxa.node_id = idxb.node_id <== that's cooc
---------------------------------
---------------------------------
AND idxa.ngram_id <> idxb.ngram_id
AND idxa.ngram_id <> idxb.ngram_id
AND idxa.node_id = MY_DOC ;
AND idxa.node_id = MY_DOC ;
...
@@ -188,7 +191,7 @@ def compute_coocs( corpus,
...
@@ -188,7 +191,7 @@ def compute_coocs( corpus,
# 4) INPUT FILTERS (reduce N before O(N²))
# 4) INPUT FILTERS (reduce N before O(N²))
if
main
list_id
:
if
on_
list_id
:
m1
=
aliased
(
NodeNgram
)
m1
=
aliased
(
NodeNgram
)
m2
=
aliased
(
NodeNgram
)
m2
=
aliased
(
NodeNgram
)
...
@@ -197,8 +200,8 @@ def compute_coocs( corpus,
...
@@ -197,8 +200,8 @@ def compute_coocs( corpus,
.
join
(
m1
,
m1
.
ngram_id
==
Xindex_ngform_id
)
.
join
(
m1
,
m1
.
ngram_id
==
Xindex_ngform_id
)
.
join
(
m2
,
m2
.
ngram_id
==
Yindex_ngform_id
)
.
join
(
m2
,
m2
.
ngram_id
==
Yindex_ngform_id
)
.
filter
(
m1
.
node_id
==
main
list_id
)
.
filter
(
m1
.
node_id
==
on_
list_id
)
.
filter
(
m2
.
node_id
==
main
list_id
)
.
filter
(
m2
.
node_id
==
on_
list_id
)
)
)
if
stoplist_id
:
if
stoplist_id
:
...
@@ -279,31 +282,36 @@ def compute_coocs( corpus,
...
@@ -279,31 +282,36 @@ def compute_coocs( corpus,
shape_1
=
len
({
pair
[
1
]
for
pair
in
matrix
.
items
})
shape_1
=
len
({
pair
[
1
]
for
pair
in
matrix
.
items
})
print
(
"COOCS: NEW matrix shape [
%
ix
%
i]"
%
(
shape_0
,
shape_1
))
print
(
"COOCS: NEW matrix shape [
%
ix
%
i]"
%
(
shape_0
,
shape_1
))
# 5) SAVE
# --------
# saving the parameters of the analysis in the Node JSON
new_hyperdata
=
{
'corpus'
:
corpus
.
id
,
'threshold'
:
threshold
}
if
overwrite_id
:
# overwrite pre-existing id
the_cooc
=
cache
.
Node
[
overwrite_id
]
the_cooc
.
hyperdata
=
new_hyperdata
the_cooc
.
save_hyperdata
()
session
.
commit
()
the_id
=
overwrite_id
else
:
# create the new cooc node
the_cooc
=
corpus
.
add_child
(
typename
=
"COOCCURRENCES"
,
name
=
"Coocs (in:
%
s)"
%
corpus
.
name
[
0
:
10
],
hyperdata
=
new_hyperdata
,
)
session
.
add
(
the_cooc
)
session
.
commit
()
the_id
=
the_cooc
.
id
# ==> save all NodeNgramNgram with link to new cooc node id
if
just_pass_result
:
matrix
.
save
(
the_id
)
return
matrix
else
:
return
the_id
# 5) SAVE
# --------
# saving the parameters of the analysis in the Node JSON
new_hyperdata
=
{
'corpus'
:
corpus
.
id
,
'threshold'
:
threshold
}
if
overwrite_id
:
# overwrite pre-existing id
the_cooc
=
cache
.
Node
[
overwrite_id
]
the_cooc
.
hyperdata
=
new_hyperdata
the_cooc
.
save_hyperdata
()
session
.
commit
()
the_id
=
overwrite_id
else
:
# create the new cooc node
the_cooc
=
corpus
.
add_child
(
typename
=
"COOCCURRENCES"
,
name
=
"Coocs (in:
%
s)"
%
corpus
.
name
[
0
:
10
],
hyperdata
=
new_hyperdata
,
)
session
.
add
(
the_cooc
)
session
.
commit
()
the_id
=
the_cooc
.
id
# ==> save all NodeNgramNgram with link to new cooc node id
matrix
.
save
(
the_id
)
return
the_id
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment