humanities / gargantext · Commits · 75a7e329

Commit 75a7e329, authored May 23, 2016 by delanoe

    Merge branch 'romain-refactoring' into unstable

Parents: 6c438c85, eee27166

Changes: 8 files, with 187 additions and 93 deletions (+187 −93)
Files changed:

  doc/schemas/ngram_parsing_flow.dot                +8    −4
  doc/schemas/ngram_parsing_flow.png                +0    −0
  gargantext/util/lists.py                          +5    −2
  gargantext/util/toolchain/__init__.py             +16   −12
  gargantext/util/toolchain/list_map.py             +14   −11
  gargantext/util/toolchain/metric_specificity.py   +18   −10
  gargantext/util/toolchain/metric_tfidf.py         +83   −19
  gargantext/util/toolchain/ngram_coocs.py          +43   −35
doc/schemas/ngram_parsing_flow.dot

@@ -6,15 +6,19 @@ digraph ngramflow {
     labelloc="t";
     "extracted_ngrams" -> "grouplist";
-    "extracted_ngrams" -> "occs+tfidfs";
+    "extracted_ngrams" -> "occs+ti_rank";
+    "main_user_stoplist" -> "stoplist";
+    "project stoplist (todo)" -> "stoplist";
     "stoplist" -> "mainlist";
-    "occs+tfidfs" -> "mainlist" [label=" TFIDF_LIMIT"];
+    "occs+ti_rank" -> "mainlist" [label=" TI_RANK_LIMIT"];
     "mainlist" -> "coocs" [label=" COOCS_THRESHOLD"];
     "coocs" -> "specificity";
     "specificity" -> "maplist" [label="MAPLIST_LIMIT\nMONOGRAM_PART"];
+    "mainlist" -> "tfidf";
+    "tfidf" -> "explore" [label="doc relations with all map and candidates"];
     "maplist" -> "explore";
     "grouplist" -> "maplist";
+    "grouplist" -> "occs+ti_rank";
+    "grouplist" -> "coocs";
+    "grouplist" -> "tfidf";
 }
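Side note: the revised flow graph can be checked mechanically. Below is a small, hypothetical Python sketch (not part of the commit) that re-encodes the new edges from the .dot file above and prints one valid execution order of the pipeline stages, using only the standard library:

    # Hypothetical checker, not in the repository; edges mirror the new
    # ngram_parsing_flow.dot above.
    from graphlib import TopologicalSorter   # stdlib since Python 3.9

    edges = {
        "extracted_ngrams":        ["grouplist", "occs+ti_rank"],
        "main_user_stoplist":      ["stoplist"],
        "project stoplist (todo)": ["stoplist"],
        "stoplist":                ["mainlist"],
        "occs+ti_rank":            ["mainlist"],
        "mainlist":                ["coocs", "tfidf"],
        "coocs":                   ["specificity"],
        "specificity":             ["maplist"],
        "grouplist":               ["maplist", "occs+ti_rank", "coocs", "tfidf"],
        "maplist":                 ["explore"],
        "tfidf":                   ["explore"],
    }

    # TopologicalSorter expects {node: predecessors}, so invert the edge map.
    preds = {}
    for src, dsts in edges.items():
        preds.setdefault(src, set())
        for dst in dsts:
            preds.setdefault(dst, set()).add(src)

    print(list(TopologicalSorter(preds).static_order()))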
doc/schemas/ngram_parsing_flow.png (binary image, mode 100755 → 100644)

  Image replaced: 52.5 KB (6c438c85) → 75.9 KB (75a7e329)
gargantext/util/lists.py

@@ -196,10 +196,10 @@ class WeightedMatrix(_BaseClass):
             self.id = source
             from gargantext.models import NodeNgramNgram
             query = (session
-                     .query(NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id, NodeNgramNgram.score)
+                     .query(NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id, NodeNgramNgram.weight)
                      .filter(NodeNgramNgram.node_id == source)
                      )
-            for key1, key2, value in self.items.items():
+            for key1, key2, value in query.all():
                 self.items[key1, key2] = value
         elif isinstance(source, WeightedMatrix):
             for key1, key2, value in source:

@@ -225,11 +225,14 @@ class WeightedMatrix(_BaseClass):
         session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id == node_id).delete()
         session.commit()
         # insert new data
+        print("WeightedMatrix bulk_insert start")
         bulk_insert(
             NodeNgramNgram,
             ('node_id', 'ngram1_id', 'ngram2_id', 'weight'),
             ((node_id, key1, key2, value) for key1, key2, value in self)
         )
+        print("WeightedMatrix bulk_insert stop")

     def __radd__(self, other):
         result = NotImplemented
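The two changed lines in the first hunk fix a real bug: the old code selected the nonexistent `score` attribute (the column is `weight`, as the bulk_insert column list confirms) and then iterated `self.items.items()`, apparently still empty at that point, instead of the query results. A minimal sketch of the corrected loading pattern, assuming a SQLAlchemy `session` and the repository's NodeNgramNgram model:

    from gargantext.models import NodeNgramNgram

    def load_matrix_items(session, source_node_id):
        """Return {(ngram1_id, ngram2_id): weight} for one cooc node."""
        query = (session
                 .query(NodeNgramNgram.ngram1_id,
                        NodeNgramNgram.ngram2_id,
                        NodeNgramNgram.weight)     # `weight`, not the old `score`
                 .filter(NodeNgramNgram.node_id == source_node_id))
        items = {}
        for key1, key2, value in query.all():      # iterate the query results,
            items[key1, key2] = value              # not the still-empty dict
        return items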
gargantext/util/toolchain/__init__.py

@@ -6,12 +6,12 @@ from .hyperdata_indexing import index_hyperdata
 # in usual run order
 from .list_stop import do_stoplist
-from .ngram_groups import compute_groups
 from .metric_tfidf import compute_occs, compute_tfidf_local, compute_ti_ranking
 from .list_main import do_mainlist
 from .ngram_coocs import compute_coocs
 from .metric_specificity import compute_specificity
 from .list_map import do_maplist    # TEST
+from .ngram_groups import compute_groups
 from .mail_notification import notify_owner
 from gargantext.util.db import session
 from gargantext.models import Node

@@ -129,27 +129,31 @@ def parse_extract_indexhyperdata(corpus):
     print('CORPUS #%d: [%s] new mainlist node #%i' % (corpus.id, t(), mainlist_id))

     # -> write local tfidf similarities to Node and NodeNodeNgram
-    # TODO only on mainlist
-    ltfidf_id = compute_tfidf_local(corpus)
+    ltfidf_id = compute_tfidf_local(corpus,
+                                    on_list_id=mainlist_id,
+                                    groupings_id=group_id)
     print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))
     # => used for doc <=> ngram association

     # ------------
-    # -> cooccurrences: compute + write (=> Node and NodeNodeNgram)
-    cooc_id = compute_coocs(corpus, mainlist_id=mainlist_id, groupings_id=group_id)
-    print('CORPUS #%d: [%s] new coocs node #%i' % (corpus.id, t(), cooc_id))
+    # -> cooccurrences on mainlist: compute + write (=> Node and NodeNgramNgram)
+    coocs = compute_coocs(corpus,
+                          on_list_id=mainlist_id,
+                          groupings_id=group_id,
+                          just_pass_result=True)
+    print('CORPUS #%d: [%s] computed mainlist coocs for specif rank' % (corpus.id, t()))

     # -> specificity: compute + write (=> NodeNodeNgram)
-    spec_id = compute_specificity(corpus, cooc_id=cooc_id
-                                  # ,groupings_id = group_id
-                                  )
+    spec_id = compute_specificity(corpus, cooc_matrix=coocs
+                                  # no need here for subforms because cooc already counted them in mainform
+                                  )
     print('CORPUS #%d: [%s] new specificity node #%i' % (corpus.id, t(), spec_id))

-    # ?? maplist: compute + write (to Node and NodeNgram)
+    # maplist: compute + write (to Node and NodeNgram)
     map_id = do_maplist(corpus,
                         mainlist_id = mainlist_id,
                         specificity_id=spec_id,
                         grouplist_id=group_id
                         )
     print('CORPUS #%d: [%s] new maplist node #%i' % (corpus.id, t(), map_id))

     print('CORPUS #%d: [%s] FINISHED ngram lists computation' % (corpus.id, t()))

@@ -160,7 +164,7 @@ def parse_extract_indexhyperdata(corpus):
     if DEBUG is False:
-        print('CORPUS #%d: [%s] FINISHED Sendind email notification' % (corpus.id, t()))
+        print('CORPUS #%d: [%s] FINISHED Sending email notification' % (corpus.id, t()))
         notify_owner(corpus)

     corpus.status('Workflow', progress=10, complete=True)
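In short, the toolchain now hands the co-occurrence matrix to the specificity step in memory instead of via an intermediate cooc node. A condensed sketch of the new handoff, using only the names that appear in the diff above:

    # cooccurrences are computed on the mainlist and NOT saved to the DB:
    coocs = compute_coocs(corpus,
                          on_list_id=mainlist_id,
                          groupings_id=group_id,
                          just_pass_result=True)   # returns a WeightedMatrix

    # specificity consumes the matrix directly (cooc_matrix replaces cooc_id)
    spec_id = compute_specificity(corpus, cooc_matrix=coocs)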
gargantext/util/toolchain/list_map.py

@@ -43,15 +43,11 @@ def do_maplist(corpus,
     #dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)

-    mainterms_subquery = (session
-                          # we want only terms within mainlist
-                          .query(NodeNgram.ngram_id)
-                          .filter(NodeNgram.node_id == mainlist_id)
-                          .subquery()
-                          )
+    MainlistTable = aliased(NodeNgram)

-    primary_groupterms_subquery = (session
-                          # we want only primary terms (ngram1)
+    IsSubform = (session
+                          # we want only secondary terms (ngram2)
+                          # to be able to filter them out
                           .query(NodeNgramNgram.ngram2_id)
                           .filter(NodeNgramNgram.node_id == grouplist_id)
                           .subquery()
                           )

@@ -63,8 +59,15 @@ def do_maplist(corpus,
     query = (session.query(ScoreSpec.ngram_id)
              .join(Ngram, Ngram.id == ScoreSpec.ngram_id)
              .filter(ScoreSpec.node_id == specificity_id)
-             .filter(ScoreSpec.ngram_id.in_(mainterms_subquery))
-             .filter(ScoreSpec.ngram_id.notin_(primary_groupterms_subquery))
+             # we want only terms within mainlist
+             .join(MainlistTable, Ngram.id == MainlistTable.ngram_id)
+             .filter(MainlistTable.node_id == mainlist_id)
+             # we remove all ngrams matching an ngram2_id from the synonyms
+             .outerjoin(IsSubform,
+                        IsSubform.c.ngram2_id == ScoreSpec.ngram_id)
+             .filter(IsSubform.c.ngram2_id == None)
             )

     # TODO: move these 2 pools up to mainlist selection

@@ -94,7 +97,7 @@ def do_maplist(corpus,
     new_hyperdata = { 'corpus': corpus.id,
                       'limit' : limit,
                       'monograms_part' : monograms_part,
-                      'monograms_result' : obtained_mono/obtained_total if obtained_total != 0 else obtained_mono
+                      'monograms_result' : obtained_mono/obtained_total if obtained_total != 0 else 0
                     }
     if overwrite_id:
         # overwrite pre-existing node
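The refactoring replaces two scalar subqueries (IN on the mainlist, NOT IN on the group subforms) with joins: a plain join to keep mainlist terms, and a left outer join plus IS NULL test, i.e. an anti-join, to drop subforms. A standalone sketch of that anti-join pattern, reusing the model names from the diff (session and models assumed imported from the app):

    from sqlalchemy.orm import aliased

    MainlistTable = aliased(NodeNgram)
    IsSubform = (session
                 .query(NodeNgramNgram.ngram2_id)
                 .filter(NodeNgramNgram.node_id == grouplist_id)
                 .subquery())

    query = (session.query(ScoreSpec.ngram_id)
             # keep only terms that are in the mainlist
             .join(MainlistTable, MainlistTable.ngram_id == ScoreSpec.ngram_id)
             .filter(MainlistTable.node_id == mainlist_id)
             # anti-join: no matching synonym row means "not a subform"
             .outerjoin(IsSubform, IsSubform.c.ngram2_id == ScoreSpec.ngram_id)
             .filter(IsSubform.c.ngram2_id == None))

The join form lets the database planner treat both conditions as ordinary joins rather than nested subqueries, which is usually cheaper on large NodeNgramNgram tables.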
gargantext/util/toolchain/metric_specificity.py

@@ -9,7 +9,7 @@ from collections import defaultdict
 from pandas import DataFrame
 import pandas as pd

-def compute_specificity(corpus, cooc_id=None, overwrite_id=None):
+def compute_specificity(corpus, cooc_id=None, cooc_matrix=None, overwrite_id=None):
     '''
     Compute the specificity, simple calculus.

@@ -18,18 +18,26 @@ def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
        - overwrite_id: optional preexisting specificity node to overwrite
     '''
     matrix = defaultdict(lambda: defaultdict(float))

-    cooccurrences = (session.query(NodeNgramNgram)
-                     .filter(NodeNgramNgram.node_id == cooc_id)
-                     )
-    # no filtering: new choice cooc already filtered on tfidf before creation
+    if cooc_id == None and cooc_matrix == None:
+        raise TypeError("compute_specificity: needs a cooc_id or cooc_matrix param")

-    for cooccurrence in cooccurrences:
-        matrix[cooccurrence.ngram1_id][cooccurrence.ngram2_id] = cooccurrence.weight
-        matrix[cooccurrence.ngram2_id][cooccurrence.ngram1_id] = cooccurrence.weight
+    elif cooc_id:
+        cooccurrences = (session.query(NodeNgramNgram)
+                         .filter(NodeNgramNgram.node_id == cooc_id)
+                         )
+        # no filtering: cooc already filtered on mainlist_id at creation

+        # £TODO re-rename weight => score
+        for cooccurrence in cooccurrences:
+            matrix[cooccurrence.ngram1_id][cooccurrence.ngram2_id] = cooccurrence.weight
+            matrix[cooccurrence.ngram2_id][cooccurrence.ngram1_id] = cooccurrence.weight

+    elif cooc_matrix:
+        # copy WeightedMatrix into local matrix structure
+        for (ngram1_id, ngram2_id) in cooc_matrix.items:
+            w = cooc_matrix.items[(ngram1_id, ngram2_id)]
+            matrix[ngram1_id][ngram2_id] = w

     nb_ngrams = len(matrix)
     print("SPECIFICITY: computing on %i ngrams" % nb_ngrams)
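The new cooc_matrix branch copies the WeightedMatrix's `items` dict, keyed by (ngram1_id, ngram2_id), into the nested defaultdict. Worth noting: unlike the cooc_id branch, it fills only one direction of each pair. A tiny illustration of that copy, with made-up ids:

    from collections import defaultdict

    matrix = defaultdict(lambda: defaultdict(float))

    # stand-in for WeightedMatrix.items: {(ngram1_id, ngram2_id): weight}
    items = {(11, 22): 3.0, (11, 33): 1.0, (22, 33): 2.0}

    for (ngram1_id, ngram2_id) in items:
        w = items[(ngram1_id, ngram2_id)]
        matrix[ngram1_id][ngram2_id] = w     # NB: no symmetric fill here

    print(len(matrix))   # 2 rows present (ids 11 and 22)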
gargantext/util/toolchain/metric_tfidf.py

@@ -377,12 +377,18 @@ def compute_ti_ranking(corpus,
-def compute_tfidf_local(corpus, overwrite_id=None):
+def compute_tfidf_local(corpus,
+                        on_list_id=None,
+                        groupings_id=None,
+                        overwrite_id=None):
     """
     Calculates tfidf similarity of each (doc, ngram) couple, within the current corpus

     Parameters:
       - the corpus itself
+      - groupings_id: optional synonym relations to add all subform counts
+                      with their mainform's counts
+      - on_list_id: mainlist or maplist type, to constrain the input ngrams
       - overwrite_id: optional id of a pre-existing TFIDF-XXXX node for this corpus
                       (the Node and its previous NodeNodeNgram rows will be replaced)
     """

@@ -398,36 +404,94 @@ def compute_tfidf_local(corpus, overwrite_id=None):
     # N
     total_docs = session.query(docids_subquery).count()

-    # number of docs with given term (number of rows = M ngrams)
-    n_docswith_ng = (session
-                     .query(
-                        NodeNgram.ngram_id,
-                        func.count(NodeNgram.node_id).label("nd")   # nd: n docs with term
-                      )
-                     .filter(NodeNgram.node_id.in_(docids_subquery))
-                     .group_by(NodeNgram.ngram_id)
-                     .all()
-                     )
-
-    # { ngram_id => log(nd) }
-    log_nd_lookup = {row.ngram_id: log(row.nd) for row in n_docswith_ng}
+    # define the counted form
+    if not groupings_id:
+        ngform_id = NodeNgram.ngram_id
+    else:
+        Syno = (session
+                .query(NodeNgramNgram.ngram1_id,
+                       NodeNgramNgram.ngram2_id)
+                .filter(NodeNgramNgram.node_id == groupings_id)
+                .subquery()
+                )
+        ngform_id = case([
+                        (Syno.c.ngram1_id != None, Syno.c.ngram1_id),
+                        (Syno.c.ngram1_id == None, NodeNgram.ngram_id)
+                    ])

     # tf for each couple (number of rows = N docs X M ngrams)
-    tf_doc_ng = (session
-                 .query(
-                    NodeNgram.ngram_id,
-                    NodeNgram.node_id,
-                    func.sum(NodeNgram.weight).label("tf"),   # tf: occurrences
-                  )
-                 .filter(NodeNgram.node_id.in_(docids_subquery))
-                 .group_by(NodeNgram.node_id, NodeNgram.ngram_id)
-                 .all()
-                 )
+    tf_doc_query = (session
+                    .query(
+                       ngform_id,
+                       NodeNgram.node_id,
+                       func.sum(NodeNgram.weight).label("tf"),   # tf: occurrences
+                     )
+                    # select within docs of current corpus
+                    .join(docids_subquery,
+                          docids_subquery.c.id == NodeNgram.node_id)
+                    )
+
+    if groupings_id:
+        tf_doc_query = (tf_doc_query
+                        .outerjoin(Syno, Syno.c.ngram2_id == NodeNgram.ngram_id)
+                        )
+        # now when we'll group_by the ngram2 freqs will be added to ngram1
+
+    if on_list_id:
+        Miamlist = aliased(NodeNgram)
+        tf_doc_query = (tf_doc_query
+                        .join(Miamlist, Miamlist.ngram_id == ngform_id)
+                        .filter(Miamlist.node_id == on_list_id)
+                        )
+
+    # execute query to do our tf sum
+    tf_per_doc = tf_doc_query.group_by(NodeNgram.node_id, ngform_id).all()
+
+    # ex: [(128371, 9732, 1.0),
+    #      (128383, 9740, 1.0),
+    #      (128373, 9731, 1.0),
+    #      (128376, 9734, 1.0),
+    #      (128372, 9731, 1.0),
+    #      (128383, 9733, 1.0),
+    #      (128383, 9735, 1.0),
+    #      (128389, 9734, 1.0),
+    #      (8624, 9731, 1.0),
+    #      (128382, 9740, 1.0),
+    #      (128383, 9739, 1.0),
+    #      (128383, 9736, 1.0),
+    #      (128378, 9735, 1.0),
+    #      (128375, 9733, 4.0),
+    #      (128383, 9732, 1.0)]
+    #       ^^^^^^  ^^^^  ^^^
+    #       ngram   doc   freq in this doc
+
+    # simultaneously count docs with given term (number of rows = M ngrams)
+    ndocswithngram = {}
+    for triple in tf_per_doc:
+        ng = triple[0]
+        doc = triple[1]
+        if ng in ndocswithngram:
+            ndocswithngram[ng] += 1
+        else:
+            ndocswithngram[ng] = 1
+    # print(ndocswithngram)
+
+    # store for use in formula
+    # { ngram_id => log(nd) }
+    log_nd_lookup = {ng: log(nd_count)
+                     for (ng, nd_count) in ndocswithngram.items()}

     # ---------------------------------------------------------
     tfidfs = {}
     log_tot_docs = log(total_docs)
-    for (ngram_id, node_id, tf) in tf_doc_ng:
+    for (ngram_id, node_id, tf) in tf_per_doc:
         log_nd = log_nd_lookup[ngram_id]
         # tfidfs[ngram_id] = tf * log(total_docs/nd)
         tfidfs[node_id, ngram_id] = tf * (log_tot_docs - log_nd)
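The final formula is unchanged by the refactoring: for each (doc, ngram) pair, tfidf = tf * (log N - log nd), which is algebraically the classic tf * log(N/nd); subtracting precomputed logs just avoids a division per row. A quick numeric check with illustrative values:

    from math import log

    total_docs = 1000   # N: docs in the corpus
    nd = 10             # docs containing the ngram
    tf = 4.0            # occurrences of the ngram in one doc

    lhs = tf * (log(total_docs) - log(nd))   # form used in the code
    rhs = tf * log(total_docs / nd)          # textbook form
    assert abs(lhs - rhs) < 1e-9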
gargantext/util/toolchain/ngram_coocs.py

@@ -10,13 +10,15 @@ from sqlalchemy.sql.expression import case # for choice if ngram has mainform or not
 def compute_coocs( corpus,
                    overwrite_id    = None,
+                   just_pass_result = True,   # just return the WeightedMatrix,
+                                              # (don't write to DB)
                    threshold       = DEFAULT_COOC_THRESHOLD,
                    groupings_id    = None,
-                   mainlist_id     = None,
+                   on_list_id      = None,
                    stoplist_id     = None,
                    start           = None,
                    end             = None,
-                   symmetry_filter = True):
+                   symmetry_filter = False):
     """
     Count how often some extracted terms appear
     together in a small context (document)

@@ -46,7 +48,7 @@ def compute_coocs( corpus,
      - threshold: on output cooc count (previously called hapax)
      - groupings_id: optional synonym relations to add all subform counts
                      with their mainform's counts
-     - mainlist_id: mainlist to constrain the input ngrams
+     - on_list_id: mainlist or maplist type, to constrain the input ngrams
      - stoplist_id: stoplist for filtering input ngrams
                     (normally unnecessary if a mainlist is already provided)
      - start, end: provide one or both temporal limits to filter on doc date

@@ -62,9 +64,10 @@ def compute_coocs( corpus,
     ======================
     each pair of ngrams sharing same doc (node_id)
         SELEC idxa.ngram_id, idxb.ngram_id
-        FROM nodes_ngrams AS idxa, nodes_ngrams AS idxb
+        FROM nodes_ngrams AS idxa
         ---------------------------------
-        WHERE idxa.node_id = idxb.node_id     <== that's cooc
+        JOIN nodes_ngrams AS idxb
+        ON idxa.node_id = idxb.node_id        <== that's cooc
         ---------------------------------
         AND idxa.ngram_id <> idxb.ngram_id
         AND idxa.node_id = MY_DOC ;

@@ -188,7 +191,7 @@ def compute_coocs( corpus,
     # 4) INPUT FILTERS (reduce N before O(N²))
-    if mainlist_id:
+    if on_list_id:
         m1 = aliased(NodeNgram)
         m2 = aliased(NodeNgram)

@@ -197,8 +200,8 @@ def compute_coocs( corpus,
             .join(m1, m1.ngram_id == Xindex_ngform_id)
             .join(m2, m2.ngram_id == Yindex_ngform_id)
-            .filter(m1.node_id == mainlist_id)
-            .filter(m2.node_id == mainlist_id)
+            .filter(m1.node_id == on_list_id)
+            .filter(m2.node_id == on_list_id)
         )

     if stoplist_id:

@@ -279,11 +282,16 @@ def compute_coocs( corpus,
     shape_1 = len({pair[1] for pair in matrix.items})
     print("COOCS: NEW matrix shape [%ix%i]" % (shape_0, shape_1))

+    if just_pass_result:
+        return matrix
+    else:
         # 5) SAVE
         # --------
         # saving the parameters of the analysis in the Node JSON
         new_hyperdata = { 'corpus': corpus.id,
                           'threshold': threshold }
         if overwrite_id:
             # overwrite pre-existing id
             the_cooc = cache.Node[overwrite_id]
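Note the default: just_pass_result=True, so after this commit compute_coocs returns the WeightedMatrix unless a caller explicitly opts back into persistence. A sketch of the two call modes (keyword names from the diff; the persisted-mode return value is presumably the cooc node id, as before):

    # in-memory mode (new default): returns a WeightedMatrix
    matrix = compute_coocs(corpus,
                           on_list_id=mainlist_id,
                           groupings_id=group_id)

    # persisted mode: writes Node + NodeNgramNgram rows as previously
    cooc_id = compute_coocs(corpus,
                            on_list_id=mainlist_id,
                            groupings_id=group_id,
                            just_pass_result=False)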