humanities / gargantext / Commits / 4bfc0b6c

Commit 4bfc0b6c, authored Jul 08, 2016 by delanoe
Merge branch 'romain-goodies' into unstable

Parents: 1925c104, f542b69e
Showing 11 changed files with 421 additions and 205 deletions (+421 / -205).
    gargantext/constants.py                            +14    -7
    gargantext/util/ngramlists_tools.py                 +1    -1
    gargantext/util/ngramsextractors.py                 +1    -1
    gargantext/util/toolchain/list_main.py              +2    -2
    gargantext/util/toolchain/list_map.py             +119   -38
    gargantext/util/toolchain/main.py                  +24   -13
    gargantext/util/toolchain/metric_specgen.py       +233    -0
    gargantext/util/toolchain/metric_specificity.py     +0  -126
    gargantext/util/toolchain/ngram_coocs.py           +19    -9
    gargantext/util/toolchain/ngrams_extraction.py      +7    -7
    gargantext/views/api/ngramlists.py                  +1    -1
gargantext/constants.py
-----------------------

@@ -12,14 +12,16 @@ LISTTYPES = {
     'STOPLIST'      : UnweightedList,
     'MAINLIST'      : UnweightedList,
     'MAPLIST'       : UnweightedList,
-    'SPECIFICITY'   : WeightedList,
+    'SPECCLUSION'   : WeightedList,
+    'GENCLUSION'    : WeightedList,
     'OCCURRENCES'   : WeightedIndex,     # could be WeightedList
     'COOCCURRENCES' : WeightedMatrix,
     'TFIDF-CORPUS'  : WeightedIndex,
     'TFIDF-GLOBAL'  : WeightedIndex,
     'TIRANK-LOCAL'  : WeightedIndex,     # could be WeightedList
-    'TIRANK-GLOBAL' : WeightedIndex      # could be WeightedList
+    'TIRANK-GLOBAL' : WeightedIndex,     # could be WeightedList
 }
 # 'OWNLIST'       : UnweightedList,     # £TODO use this for any term-level tags

 NODETYPES = [    # TODO separate id not array index, read by models.node

@@ -37,7 +39,7 @@ NODETYPES = [
     'COOCCURRENCES',    # 9
     # scores
     'OCCURRENCES',      # 10
-    'SPECIFICITY',      # 11
+    'SPECCLUSION',      # 11
     'CVALUE',           # 12
     'TFIDF-CORPUS',     # 13
     'TFIDF-GLOBAL',     # 14

@@ -47,6 +49,7 @@ NODETYPES = [
     # more scores (sorry!)
     'TIRANK-LOCAL',     # 16
     'TIRANK-GLOBAL',    # 17
+    'GENCLUSION',       # 18
 ]

 INDEXED_HYPERDATA = {

@@ -222,12 +225,16 @@ DEFAULT_RANK_CUTOFF_RATIO = .75 # MAINLIST maximum terms in %
 DEFAULT_RANK_HARD_LIMIT = 5000          # MAINLIST maximum terms abs
                                         # (makes COOCS larger ~ O(N²) /!\)

-DEFAULT_COOC_THRESHOLD = 2              # inclusive minimum for COOCS coefs
+DEFAULT_COOC_THRESHOLD = 3              # inclusive minimum for COOCS coefs
                                         # (makes COOCS more sparse)

 DEFAULT_MAPLIST_MAX = 350               # MAPLIST maximum terms

-DEFAULT_MAPLIST_MONOGRAMS_RATIO = .15   # part of monograms in MAPLIST
+DEFAULT_MAPLIST_MONOGRAMS_RATIO = .2    # quota of monograms in MAPLIST
                                         # (vs multigrams = 1-mono)

+DEFAULT_MAPLIST_GENCLUSION_RATIO = .6   # quota of top genclusion in MAPLIST
+                                        # (vs top specclusion = 1-gen)

 DEFAULT_MAX_NGRAM_LEN = 7               # limit used after POStagging rule
                                         # (initial ngrams number is a power law of this /!\)

@@ -272,7 +279,7 @@ DOWNLOAD_DIRECTORY = UPLOAD_DIRECTORY
 # about batch processing...
 BATCH_PARSING_SIZE = 256
-BATCH_NGRAMSEXTRACTION_SIZE = 1024
+BATCH_NGRAMSEXTRACTION_SIZE = 3000      # how many distinct ngrams before INTEGRATE

 # Scrapers config

@@ -282,7 +289,7 @@ QUERY_SIZE_N_DEFAULT = 1000
 # Grammar rules for chunking
 RULE_JJNN   = "{<JJ.*>*<NN.*|>+<JJ.*>*}"
-RULE_JJDTNN = "{<JJ.*>*<NN.*>+((<P|IN> <DT>? <JJ.*>* <NN.*>+ <JJ.*>*)|(<JJ.*>))*}"
+RULE_NPN    = "{<JJ.*>*<NN.*>+((<P|IN> <DT>? <JJ.*>* <NN.*>+ <JJ.*>*)|(<JJ.*>))*}"
 RULE_TINA = "^((VBD,|VBG,|VBN,|CD.?,|JJ.?,|\?,){0,2}?(N.?.?,|\?,)+?(CD.,)??)\+?((PREP.?|DET.?,|IN.?,|CC.?,|\?,)((VBD,|VBG,|VBN,|CD.?,|JJ.?,|\?\,){0,2}?(N.?.?,|\?,)+?)+?)*?$"
gargantext/util/ngramlists_tools.py
-----------------------------------

@@ -19,7 +19,7 @@ from gargantext.constants import DEFAULT_CSV_DELIM, DEFAULT_CSV_DELIM_GRO
 # import will implement the same text cleaning procedures as toolchain
 from gargantext.util.toolchain.parsing           import normalize_chars
-from gargantext.util.toolchain.ngrams_extraction import normalize_terms
+from gargantext.util.toolchain.ngrams_extraction import normalize_forms
 from sqlalchemy.sql import exists
 from os import path
gargantext/util/ngramsextractors.py
-----------------------------------

-from gargantext.constants import LANGUAGES, DEFAULT_MAX_NGRAM_LEN, RULE_JJNN, RULE_JJDTNN
+from gargantext.constants import LANGUAGES, DEFAULT_MAX_NGRAM_LEN, RULE_JJNN, RULE_NPN

 import nltk
 import re
gargantext/util/toolchain/list_main.py
--------------------------------------

@@ -39,11 +39,11 @@ def do_mainlist(corpus,
     # retrieve helper nodes if not provided
     if not ranking_scores_id:
         ranking_scores_id = session.query(Node.id).filter(
-                                Node.typename == "TFIDF-GLOBAL",
+                                Node.typename == "TIRANK-GLOBAL",
                                 Node.parent_id == corpus.id
                             ).first()
         if not ranking_scores_id:
-            raise ValueError("MAINLIST: TFIDF node needed for mainlist creation")
+            raise ValueError("MAINLIST: TIRANK node needed for mainlist creation")

     if not stoplist_id:
         stoplist_id = session.query(Node.id).filter(
gargantext/util/toolchain/list_map.py
-------------------------------------

@@ -9,37 +9,49 @@ from gargantext.util.db_cache import cache
 from gargantext.util.lists import UnweightedList
 from sqlalchemy import desc, asc
 from gargantext.constants import DEFAULT_MAPLIST_MAX,\
+                                 DEFAULT_MAPLIST_GENCLUSION_RATIO,\
                                  DEFAULT_MAPLIST_MONOGRAMS_RATIO

 def do_maplist(corpus,
                overwrite_id    = None,
                mainlist_id     = None,
-               specificity_id  = None,
+               specclusion_id  = None,
+               genclusion_id   = None,
                grouplist_id    = None,
                limit           = DEFAULT_MAPLIST_MAX,
+               genclusion_part = DEFAULT_MAPLIST_GENCLUSION_RATIO,
                monograms_part  = DEFAULT_MAPLIST_MONOGRAMS_RATIO
               ):
     '''
-    According to Specificities and mainlist
+    According to Genericity/Specificity and mainlist

     Parameters:
       - mainlist_id (starting point, already cleaned of stoplist terms)
-      - specificity_id (ranking factor)
+      - specclusion_id (ngram inclusion by cooc specificity -- ranking factor)
+      - genclusion_id  (ngram inclusion by cooc genericity  -- ranking factor)
       - grouplist_id (filtering grouped ones)
       - overwrite_id: optional if preexisting MAPLIST node to overwrite

-      + 2 constants to modulate the terms choice
+      + 3 params to modulate the terms choice
         - limit for the amount of picked terms
         - monograms_part: a ratio of terms with only one lexical unit to keep
              (multigrams quota = limit * (1-monograms_part))
+        - genclusion_part: a ratio of terms picked by genericity ranking
+             (speclusion quota = limit * (1-genclusion_part))
     '''
-    if not (mainlist_id and specificity_id and grouplist_id):
-        raise ValueError("Please provide mainlist_id, specificity_id and grouplist_id")
+    if not (mainlist_id and specclusion_id and genclusion_id and grouplist_id):
+        raise ValueError("Please provide mainlist_id, specclusion_id, genclusion_id and grouplist_id")

-    monograms_limit = round(limit * monograms_part)
-    multigrams_limit = limit - monograms_limit
-    print("MAPLIST: monograms_limit =", monograms_limit)
-    print("MAPLIST: multigrams_limit = ", multigrams_limit)
+    quotas = {'topgen':{}, 'topspec':{}}
+    genclusion_limit = round(limit * genclusion_part)
+    speclusion_limit = limit - genclusion_limit
+    quotas['topgen']['monograms']   = round(genclusion_limit * monograms_part)
+    quotas['topgen']['multigrams']  = genclusion_limit - quotas['topgen']['monograms']
+    quotas['topspec']['monograms']  = round(speclusion_limit * monograms_part)
+    quotas['topspec']['multigrams'] = speclusion_limit - quotas['topspec']['monograms']
+    print("MAPLIST quotas:", quotas)

     #dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)

@@ -54,11 +66,19 @@ def do_maplist(corpus,
     )

     ScoreSpec = aliased(NodeNgram)
+    ScoreGen  = aliased(NodeNgram)

-    # specificity-ranked
-    query = (session.query(ScoreSpec.ngram_id)
+    # ngram with both ranking factors spec and gen
+    query = (session.query(
+                ScoreSpec.ngram_id,
+                ScoreSpec.weight,
+                ScoreGen.weight,
+                Ngram.n
+             )
             .join(Ngram, Ngram.id == ScoreSpec.ngram_id)
-            .filter(ScoreSpec.node_id == specificity_id)
+            .filter(ScoreSpec.node_id == specclusion_id)
+            .join(ScoreGen, ScoreGen.ngram_id == ScoreSpec.ngram_id)
+            .filter(ScoreGen.node_id == genclusion_id)

             # we want only terms within mainlist
             .join(MainlistTable, Ngram.id == MainlistTable.ngram_id)

@@ -68,36 +88,99 @@ def do_maplist(corpus,
             .outerjoin(IsSubform,
                        IsSubform.c.ngram2_id == ScoreSpec.ngram_id)
             .filter(IsSubform.c.ngram2_id == None)
            )

-    # TODO: move these 2 pools up to mainlist selection
-    top_monograms = (query
-        .filter(Ngram.n == 1)
-        .order_by(asc(ScoreSpec.weight))
-        .limit(monograms_limit)
-        .all()
-    )
-    top_multigrams = (query
-        .filter(Ngram.n >= 2)
-        # specificity-ranked
-        .order_by(desc(ScoreSpec.weight))
-        .limit(multigrams_limit)
-        .all()
-    )
-    obtained_mono  = len(top_monograms)
-    obtained_multi = len(top_multigrams)
-    obtained_total = obtained_mono + obtained_multi
-    # print("MAPLIST: top_monograms =", obtained_mono)
-    # print("MAPLIST: top_multigrams = ", obtained_multi)
+    # format in scored_ngrams array:
+    # -------------------------------
+    #  [(37723,  8.428, 14.239,   3   ), etc]
+    #   ngramid  wspec   wgen   nwords
+    scored_ngrams = query.all()
+    n_ngrams = len(scored_ngrams)
+    if n_ngrams == 0:
+        raise ValueError("No ngrams in cooc table ?")
+
+    # results, with same structure as quotas
+    chosen_ngrams = {'topgen':  {'monograms':[], 'multigrams':[]},
+                     'topspec': {'monograms':[], 'multigrams':[]}}
+
+    # specificity and genericity are rather reverse-correlated
+    # but occasionally they can have common ngrams (same ngram well ranked in both)
+    # => we'll use a lookup table to check if we didn't already get it
+    already_gotten_ngramids = {}
+
+    # 2 loops to fill spec-clusion then gen-clusion quotas
+    # (1st loop uses order from DB, 2nd loop uses our own sort at end of 1st)
+    for rkr in ['topspec', 'topgen']:
+        got_enough_mono  = False
+        got_enough_multi = False
+        all_done = False
+        i = -1
+        while ((not all_done) and (not (got_enough_mono and got_enough_multi))):
+            # retrieve sorted ngram n° i
+            i += 1
+            (ng_id, wspec, wgen, nwords) = scored_ngrams[i]
+            # before any continue case, we check the next i for max reached
+            all_done = (i+1 >= n_ngrams)
+            if ng_id in already_gotten_ngramids:
+                continue
+            # NB: nwords could be replaced by a simple search on r' '
+            if nwords == 1:
+                if got_enough_mono:
+                    continue
+                else:
+                    # add ngram to results and lookup
+                    chosen_ngrams[rkr]['monograms'].append(ng_id)
+                    already_gotten_ngramids[ng_id] = True
+            # multi
+            else:
+                if got_enough_multi:
+                    continue
+                else:
+                    # add ngram to results and lookup
+                    chosen_ngrams[rkr]['multigrams'].append(ng_id)
+                    already_gotten_ngramids[ng_id] = True
+            got_enough_mono  = (len(chosen_ngrams[rkr]['monograms'])  >= quotas[rkr]['monograms'])
+            got_enough_multi = (len(chosen_ngrams[rkr]['multigrams']) >= quotas[rkr]['multigrams'])
+        # at the end of the first loop we just need to sort all by the second ranker (gen)
+        scored_ngrams = sorted(scored_ngrams, key=lambda ng_infos: ng_infos[2], reverse=True)
+
+    obtained_spec_mono  = len(chosen_ngrams['topspec']['monograms'])
+    obtained_spec_multi = len(chosen_ngrams['topspec']['multigrams'])
+    obtained_gen_mono   = len(chosen_ngrams['topgen']['monograms'])
+    obtained_gen_multi  = len(chosen_ngrams['topgen']['multigrams'])
+    obtained_total = obtained_spec_mono   \
+                   + obtained_spec_multi  \
+                   + obtained_gen_mono    \
+                   + obtained_gen_multi
+    print("MAPLIST: top_spec_monograms =",  obtained_spec_mono)
+    print("MAPLIST: top_spec_multigrams =", obtained_spec_multi)
+    print("MAPLIST: top_gen_monograms =",   obtained_gen_mono)
+    print("MAPLIST: top_gen_multigrams =",  obtained_gen_multi)
+    print("MAPLIST: kept %i ngrams in total " % obtained_total)
+
+    obtained_data = chosen_ngrams['topspec']['monograms']   \
+                  + chosen_ngrams['topspec']['multigrams']  \
+                  + chosen_ngrams['topgen']['monograms']    \
+                  + chosen_ngrams['topgen']['multigrams']

     # NEW MAPLIST NODE
     # -----------------
     # saving the parameters of the analysis in the Node JSON
     new_hyperdata = { 'corpus': corpus.id,
                       'limit' : limit,
                       'monograms_part' : monograms_part,
-                      'monograms_result' : obtained_mono/obtained_total if obtained_total != 0 else 0
+                      'genclusion_part' : genclusion_part,
                     }
     if overwrite_id:
         # overwrite pre-existing node

@@ -118,9 +201,7 @@ def do_maplist(corpus,
         the_id = the_maplist.id

     # create UnweightedList object and save (=> new NodeNgram rows)
-    datalist = UnweightedList(
-            [res.ngram_id for res in top_monograms + top_multigrams]
-        )
+    datalist = UnweightedList(obtained_data)

     # save
     datalist.save(the_id)
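
To see the quota arithmetic of the new do_maplist concretely, here is a standalone sketch (ours, not part of the commit) using the new default ratios from constants.py:

# budget split: first genclusion vs specclusion, then monograms vs multigrams
limit, genclusion_part, monograms_part = 350, .6, .2

genclusion_limit = round(limit * genclusion_part)   # 210 terms picked by genericity
speclusion_limit = limit - genclusion_limit         # 140 terms picked by specificity

quotas = {'topgen': {}, 'topspec': {}}
quotas['topgen']['monograms']   = round(genclusion_limit * monograms_part)           # 42
quotas['topgen']['multigrams']  = genclusion_limit - quotas['topgen']['monograms']   # 168
quotas['topspec']['monograms']  = round(speclusion_limit * monograms_part)           # 28
quotas['topspec']['multigrams'] = speclusion_limit - quotas['topspec']['monograms']  # 112

print(quotas)
# {'topgen': {'monograms': 42, 'multigrams': 168},
#  'topspec': {'monograms': 28, 'multigrams': 112}}    -> 350 terms in total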
gargantext/util/toolchain/main.py
---------------------------------

@@ -10,8 +10,8 @@ from .ngram_groups import compute_groups
 from .metric_tfidf import compute_occs, compute_tfidf_local, compute_ti_ranking
 from .list_main import do_mainlist
 from .ngram_coocs import compute_coocs
-from .metric_specificity import compute_specificity
-from .list_map import do_maplist
+# TEST
+from .metric_specgen import compute_specgen
+from .list_map import do_maplist
 from .mail_notification import notify_owner
 from gargantext.util.db import session
 from gargantext.models import Node

@@ -136,22 +136,26 @@ def parse_extract_indexhyperdata(corpus):
     # => used for doc <=> ngram association
     # ------------

     # -> cooccurrences on mainlist: compute + write (=> Node and NodeNgramNgram)
     coocs = compute_coocs(corpus,
                           on_list_id = mainlist_id,
                           groupings_id = group_id,
-                          just_pass_result = True)
+                          just_pass_result = True,
+                          diagonal_filter = False)    # preserving the diagonal
+                                                      # (useful for spec/gen)
     print('CORPUS #%d: [%s] computed mainlist coocs for specif rank' % (corpus.id, t()))

-    # -> specificity: compute + write (=> NodeNgram)
-    spec_id = compute_specificity(corpus, cooc_matrix = coocs)
+    # -> specclusion/genclusion: compute + write (2 Nodes + 2 lists in NodeNgram)
+    (spec_id, gen_id) = compute_specgen(corpus, cooc_matrix = coocs)
     # no need here for subforms because cooc already counted them in mainform

-    print('CORPUS #%d: [%s] new specificity node #%i' % (corpus.id, t(), spec_id))
+    print('CORPUS #%d: [%s] new spec-clusion node #%i' % (corpus.id, t(), spec_id))
+    print('CORPUS #%d: [%s] new gen-clusion node #%i'  % (corpus.id, t(), gen_id))

     # maplist: compute + write (to Node and NodeNgram)
     map_id = do_maplist(corpus,
                         mainlist_id = mainlist_id,
-                        specificity_id = spec_id,
+                        specclusion_id = spec_id,
+                        genclusion_id  = gen_id,
                         grouplist_id = group_id
                         )
     print('CORPUS #%d: [%s] new maplist node #%i' % (corpus.id, t(), map_id))

@@ -187,7 +191,7 @@ def recount(corpus):
        - ndocs
        - ti_rank
        - coocs
-       - specificity
+       - specclusion/genclusion
       - tfidf

     NB: no new extraction, no list change, just the metrics

@@ -208,10 +212,15 @@ def recount(corpus):
         old_tirank_id = None

     try:
-        old_spec_id = corpus.children("SPECIFICITY").first().id
+        old_spec_id = corpus.children("SPECCLUSION").first().id
     except:
         old_spec_id = None

+    try:
+        old_gen_id = corpus.children("GENCLUSION").first().id
+    except:
+        old_gen_id = None

     try:
         old_ltfidf_id = corpus.children("TFIDF-CORPUS").first().id
     except:

@@ -254,11 +263,13 @@ def recount(corpus):
                           just_pass_result = True)
     print('RECOUNT #%d: [%s] updated mainlist coocs for specif rank' % (corpus.id, t()))

-    # -> specificity: compute + write (=> NodeNgram)
-    spec_id = compute_specificity(corpus, cooc_matrix = coocs, overwrite_id = old_spec_id)
+    # -> specclusion/genclusion: compute + write (=> NodeNodeNgram)
+    (spec_id, gen_id) = compute_specgen(corpus, cooc_matrix = coocs,
+                                        spec_overwrite_id = spec_id,
+                                        gen_overwrite_id  = gen_id)

-    print('RECOUNT #%d: [%s] updated specificity node #%i' % (corpus.id, t(), spec_id))
+    print('RECOUNT #%d: [%s] updated spec-clusion node #%i' % (corpus.id, t(), spec_id))
+    print('RECOUNT #%d: [%s] updated gen-clusion node #%i'  % (corpus.id, t(), gen_id))

     print('RECOUNT #%d: [%s] FINISHED metric recounts' % (corpus.id, t()))
gargantext/util/toolchain/metric_specgen.py (new file, 0 → 100644)
-------------------------------------------------------------------

"""
Computes a specificity metric from the ngram cooccurrence matrix.
  + SAVE => WeightedList => NodeNgram
"""

from gargantext.models     import Node, Ngram, NodeNgram, NodeNgramNgram
from gargantext.util.db    import session, aliased, func, bulk_insert
from gargantext.util.lists import WeightedList
from collections           import defaultdict
from pandas                import DataFrame
from numpy                 import diag


def round3(floating_number):
    """
    Rounds a floating number to 3 decimals
    Good when we don't need so much details in the DB written data
    """
    return float("%.3f" % floating_number)


def compute_specgen(corpus, cooc_id=None, cooc_matrix=None,
                    spec_overwrite_id=None, gen_overwrite_id=None):
    '''
    Compute genericity/specificity:
      P(j|i) = N(ij) / N(ii)
      P(i|j) = N(ij) / N(jj)

      Gen(i)  = Sum{j} P(j_k|i)
      Spec(i) = Sum{j} P(i|j_k)

      Gen-clusion(i)  = (Spec(i) + Gen(i)) / 2
      Spec-clusion(i) = (Spec(i) - Gen(i)) / 2

    Parameters:
      - cooc_id: mandatory id of a cooccurrences node to use as base
      - spec_overwrite_id: optional preexisting specificity node to overwrite
      - gen_overwrite_id: optional preexisting genericity node to overwrite
    '''
    matrix = defaultdict(lambda: defaultdict(float))

    if cooc_id == None and cooc_matrix == None:
        raise TypeError("compute_specificity: needs a cooc_id or cooc_matrix param")

    elif cooc_id:
        cooccurrences = (session.query(NodeNgramNgram)
                         .filter(NodeNgramNgram.node_id == cooc_id))
        # no filtering: cooc already filtered on mainlist_id at creation
        for cooccurrence in cooccurrences:
            matrix[cooccurrence.ngram1_id][cooccurrence.ngram2_id] = cooccurrence.weight
            # matrix[cooccurrence.ngram2_id][cooccurrence.ngram1_id] = cooccurrence.weight

    elif cooc_matrix:
        # copy WeightedMatrix into local matrix structure
        for (ngram1_id, ngram2_id) in cooc_matrix.items:
            w = cooc_matrix.items[(ngram1_id, ngram2_id)]
            # ------- 8< --------------------------------------------
            # tempo hack to ignore lines/columns where diagonal == 0
            # £TODO find why they exist and then remove this snippet
            if (((ngram1_id, ngram1_id) not in cooc_matrix.items)
                or ((ngram2_id, ngram2_id) not in cooc_matrix.items)):
                continue
            # ------- 8< --------------------------------------------
            matrix[ngram1_id][ngram2_id] = w

    nb_ngrams = len(matrix)
    print("SPECIFICITY: computing on %i ngrams" % nb_ngrams)

    # example corpus (7 docs, 8 nouns)
    # --------------------------------
    # "The report says that humans are animals."
    # "The report says that rivers are full of water."
    # "The report says that humans like to make war."
    # "The report says that animals must eat food."
    # "The report says that animals drink water."
    # "The report says that humans like food and water."
    # "The report says that grass is food for some animals."

    #===========================================================================
    cooc_counts = DataFrame(matrix).fillna(0)

    # cooc_counts matrix
    # ------------------
    #          animals  food  grass  humans  report  rivers  war  water
    # animals        4     2      1       1       4       0    0      1
    # food           2     3      1       1       3       0    0      1
    # grass          1     1      1       0       1       0    0      0
    # humans         1     1      0       3       3       0    1      1
    # report         4     3      1       3       7       1    1      3
    # rivers         0     0      0       0       1       1    0      1
    # war            0     0      0       1       1       0    1      0
    # water          1     1      0       1       3       1    0      3

    #===========================================================================
    # conditional p(col|line)
    diagonal = list(diag(cooc_counts))
    # debug
    # print("WARN diag: ", diagonal)
    # print("WARN diag: =================== 0 in diagonal ?\n",
    #        0 in diagonal ? "what ??? zeros in the diagonal :/" : "ok no zeros",
    #        "\n===================")
    p_col_given_line = cooc_counts / list(diag(cooc_counts))

    # p_col_given_line
    # ----------------
    #          animals  food  grass  humans  report  rivers  war  water
    # animals      1.0   0.7    1.0     0.3     0.6     0.0  0.0    0.3
    # food         0.5   1.0    1.0     0.3     0.4     0.0  0.0    0.3
    # grass        0.2   0.3    1.0     0.0     0.1     0.0  0.0    0.0
    # humans       0.2   0.3    0.0     1.0     0.4     0.0  1.0    0.3
    # report       1.0   1.0    1.0     1.0     1.0     1.0  1.0    1.0
    # rivers       0.0   0.0    0.0     0.0     0.1     1.0  0.0    0.3
    # war          0.0   0.0    0.0     0.3     0.1     0.0  1.0    0.0
    # water        0.2   0.3    0.0     0.3     0.4     1.0  0.0    1.0

    #===========================================================================
    # total per lines (<=> genericity)
    Gen = p_col_given_line.sum(axis=1)

    # Gen.sort_values(ascending=False)
    # ---
    # report     8.0
    # animals    3.9
    # food       3.6
    # water      3.3
    # humans     3.3
    # grass      1.7
    # war        1.5
    # rivers     1.5

    #===========================================================================
    # total columnwise (<=> specificity)
    Spec = p_col_given_line.sum(axis=0)

    # Spec.sort_values(ascending=False)
    # ----
    # grass      4.0
    # food       3.7
    # water      3.3
    # humans     3.3
    # report     3.3
    # animals    3.2
    # war        3.0
    # rivers     3.0

    #===========================================================================
    # our "inclusion by specificity" metric
    Specclusion = Spec - Gen

    # Specclusion.sort_values(ascending=False)
    # -----------
    # grass      1.1
    # war        0.8
    # rivers     0.8
    # food       0.0
    # humans    -0.0
    # water     -0.0
    # animals   -0.3
    # report    -2.4

    #===========================================================================
    # our "inclusion by genericity" metric
    Genclusion = Spec + Gen

    # Genclusion.sort_values(ascending=False)
    # -----------
    # report    11.3
    # food       7.3
    # animals    7.2
    # water      6.7
    # humans     6.7
    # grass      5.7
    # war        4.5
    # rivers     4.5

    #===========================================================================
    # specificity node
    if spec_overwrite_id:
        # overwrite pre-existing id
        the_spec_id = spec_overwrite_id
        session.query(NodeNgram).filter(NodeNgram.node_id == the_spec_id).delete()
        session.commit()
    else:
        specnode = corpus.add_child(
            typename = "SPECCLUSION",
            name     = "Specclusion (in:%s)" % corpus.id
        )
        session.add(specnode)
        session.commit()
        the_spec_id = specnode.id

    if not Specclusion.empty:
        data = WeightedList(
                zip(Specclusion.index.tolist(),
                    [v for v in map(round3, Specclusion.values.tolist())])
               )
        data.save(the_spec_id)
    else:
        print("WARNING: had no terms in COOCS => empty SPECCLUSION node")

    #===========================================================================
    # genclusion node
    if gen_overwrite_id:
        the_gen_id = gen_overwrite_id
        session.query(NodeNgram).filter(NodeNgram.node_id == the_gen_id).delete()
        session.commit()
    else:
        gennode = corpus.add_child(
            typename = "GENCLUSION",
            name     = "Genclusion (in:%s)" % corpus.id
        )
        session.add(gennode)
        session.commit()
        the_gen_id = gennode.id

    if not Genclusion.empty:
        data = WeightedList(
                zip(Genclusion.index.tolist(),
                    [v for v in map(round3, Genclusion.values.tolist())])
               )
        data.save(the_gen_id)
    else:
        print("WARNING: had no terms in COOCS => empty GENCLUSION node")

    #===========================================================================
    return (the_spec_id, the_gen_id)
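
The toy matrix in the file's comments can be re-checked directly with pandas; a self-contained sketch (our illustration, not part of the commit):

from pandas import DataFrame
from numpy import diag

labels = ["animals", "food", "grass", "humans", "report", "rivers", "war", "water"]
counts = [[4,2,1,1,4,0,0,1],
          [2,3,1,1,3,0,0,1],
          [1,1,1,0,1,0,0,0],
          [1,1,0,3,3,0,1,1],
          [4,3,1,3,7,1,1,3],
          [0,0,0,0,1,1,0,1],
          [0,0,0,1,1,0,1,0],
          [1,1,0,1,3,1,0,3]]
cooc_counts = DataFrame(counts, index=labels, columns=labels)

# dividing by the diagonal broadcasts over columns: entry (i,j) becomes N(ij)/N(jj)
p = cooc_counts / list(diag(cooc_counts))

Gen  = p.sum(axis=1)        # row sums    -> 'report' tops at 8.0
Spec = p.sum(axis=0)        # column sums -> 'grass' tops at 4.0

print((Spec - Gen).sort_values(ascending=False).round(1))   # spec-clusion ranking
print((Spec + Gen).sort_values(ascending=False).round(1))   # gen-clusion ranking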
gargantext/util/toolchain/metric_specificity.py (deleted, 100644 → 0)
----------------------------------------------------------------------

"""
Computes a specificity metric from the ngram cooccurrence matrix.
  + SAVE => WeightedList => NodeNgram
"""

from gargantext.models     import Node, Ngram, NodeNgram, NodeNgramNgram
from gargantext.util.db    import session, aliased, func, bulk_insert
from gargantext.util.lists import WeightedList
from collections           import defaultdict
from pandas                import DataFrame
import pandas as pd


def compute_specificity(corpus, cooc_id=None, cooc_matrix=None, overwrite_id=None):
    '''
    Compute the specificity, simple calculus.

    Parameters:
      - cooc_id: mandatory id of a cooccurrences node to use as base
      - overwrite_id: optional preexisting specificity node to overwrite
    '''
    matrix = defaultdict(lambda: defaultdict(float))

    if cooc_id == None and cooc_matrix == None:
        raise TypeError("compute_specificity: needs a cooc_id or cooc_matrix param")

    elif cooc_id:
        cooccurrences = (session.query(NodeNgramNgram)
                         .filter(NodeNgramNgram.node_id == cooc_id))
        # no filtering: cooc already filtered on mainlist_id at creation
        for cooccurrence in cooccurrences:
            matrix[cooccurrence.ngram1_id][cooccurrence.ngram2_id] = cooccurrence.weight
            matrix[cooccurrence.ngram2_id][cooccurrence.ngram1_id] = cooccurrence.weight

    elif cooc_matrix:
        # copy WeightedMatrix into local matrix structure
        for (ngram1_id, ngram2_id) in cooc_matrix.items:
            w = cooc_matrix.items[(ngram1_id, ngram2_id)]
            matrix[ngram1_id][ngram2_id] = w

    nb_ngrams = len(matrix)
    print("SPECIFICITY: computing on %i ngrams" % nb_ngrams)

    x = DataFrame(matrix).fillna(0)

    # proba (x/y) ( <= we divide each line by its total)
    x = x / x.sum(axis=1)

    # vectorisation
    # d:Matrix => v: Vector (len = nb_ngrams)
    # v = d.sum(axis=1) (minus itself)
    xs = x.sum(axis=1) - x
    ys = x.sum(axis=0) - x

    # top included or excluded
    #n = ( xs + ys) / (2 * (x.shape[0] - 1))

    # top generic or specific (asc is spec, desc is generic)
    v = (xs - ys) / (2 * (x.shape[0] - 1))

    ## d ##
    #######
    #               Grenelle  biodiversité  kilomètres  site  élus  île
    # Grenelle             0             0           4     0     0    0
    # biodiversité         0             0           0     0     4    0
    # kilomètres           4             0           0     0     4    0
    # site                 0             0           0     0     4    6
    # élus                 0             4           4     4     0    0
    # île                  0             0           0     6     0    0

    ## d.sum(axis=1) ##
    ###################
    # Grenelle       4
    # biodiversité   4
    # kilomètres     8
    # site          10
    # élus          12
    # île            6

    # temporary result
    # -------------------
    # for now we use the line sums as the specificity ranking
    # (**same** order as with the pre-refactoring formula, but simpler to compute)
    # TODO check the mathematical AND semantic coherence of this indicator

    #v.sort_values(inplace=True)
    # [ ('biodiversité' , 0.333 ),
    #   ('Grenelle'     , 0.5   ),
    #   ('île'          , 0.599 ),
    #   ('kilomètres'   , 1.333 ),
    #   ('site'         , 1.333 ),
    #   ('élus'         , 1.899 ) ]

    # ----------------
    # specificity node
    if overwrite_id:
        # overwrite pre-existing id
        the_id = overwrite_id
        session.query(NodeNgram).filter(NodeNgram.node_id == the_id).delete()
        session.commit()
    else:
        specnode = corpus.add_child(
            typename = "SPECIFICITY",
            name     = "Specif (in:%s)" % corpus.id
        )
        session.add(specnode)
        session.commit()
        the_id = specnode.id

    # print(v)
    pd.options.display.float_format = '${:,.2f}'.format

    if not v.empty:
        data = WeightedList(
                zip(v.index.tolist(), v.values.tolist()[0])
               )
        data.save(the_id)
    else:
        print("WARNING: had no terms in COOCS => empty SPECIFICITY node")

    return(the_id)
gargantext/util/toolchain/ngram_coocs.py
----------------------------------------

@@ -18,7 +18,8 @@ def compute_coocs( corpus,
                    stoplist_id     = None,
                    start           = None,
                    end             = None,
-                   symmetry_filter = False):
+                   symmetry_filter = False,
+                   diagonal_filter = True):
     """
     Count how often some extracted terms appear
     together in a small context (document)

@@ -55,6 +56,9 @@ def compute_coocs( corpus,
               NB the expected type of parameter value is datetime.datetime
                  (string is also possible but format must follow
                   this convention: "2001-01-01" aka "%Y-%m-%d")
      - symmetry_filter: prevent calculating where ngram1_id > ngram2_id
+     - diagonal_filter: prevent calculating where ngram1_id == ngram2_id

      (deprecated parameters)
      - field1,2: allowed to count other things than ngrams (eg tags) but no use case at present

@@ -69,7 +73,7 @@ def compute_coocs( corpus,
             JOIN nodes_ngrams AS idxb
                 ON idxa.node_id = idxb.node_id          <== that's cooc
             ---------------------------------
-            AND idxa.ngram_id <> idxb.ngram_id
+            AND idxa.ngram_id <> idxb.ngram_id          (diagonal_filter)
             AND idxa.node_id = MY_DOC ;

         on entire corpus

@@ -152,16 +156,14 @@ def compute_coocs( corpus,
             ucooc
             # for debug (2/4)
-            #, Xngram.terms.label("w_x")
-            #, Yngram.terms.label("w_y")
+            # , Xngram.terms.label("w_x")
+            # , Yngram.terms.label("w_y")
            )
            .join(Yindex, Xindex.node_id == Yindex.node_id)   # <- by definition of cooc
            .join(Node, Node.id == Xindex.node_id)            # <- b/c within corpus
            .filter(Node.parent_id == corpus.id)              # <- b/c within corpus
            .filter(Node.typename == "DOCUMENT")              # <- b/c within corpus
-           .filter(Xindex_ngform_id != Yindex_ngform_id)     # <- b/c not with itself
           )

     # outerjoin the synonyms if needed

@@ -179,12 +181,12 @@ def compute_coocs( corpus,
           .group_by(
              Xindex_ngform_id, Yindex_ngform_id   # <- what we're counting
              # for debug (3/4)
-             #,"w_x", "w_y"
+             # ,"w_x", "w_y"
           )
           # for debug (4/4)
-          #.join(Xngram, Xngram.id == Xindex_ngform_id)
-          #.join(Yngram, Yngram.id == Yindex_ngform_id)
+          # .join(Xngram, Xngram.id == Xindex_ngform_id)
+          # .join(Yngram, Yngram.id == Yindex_ngform_id)
          .order_by(ucooc)
         )

@@ -192,6 +194,9 @@ def compute_coocs( corpus,
     # 4) INPUT FILTERS (reduce N before O(N²))
     if on_list_id:
+        # £TODO separate lists, or one list for x and all ngrams for y,
+        #       which would allow expanding the list to nearest neighbours
+        #       (MacLachlan) (with a rectangular matrix)
         m1 = aliased(NodeNgram)
         m2 = aliased(NodeNgram)

@@ -226,6 +231,10 @@ def compute_coocs( corpus,
         )

+    if diagonal_filter:
+        # don't compute ngram with itself
+        coocs_query = coocs_query.filter(Xindex_ngform_id != Yindex_ngform_id)

     if start or end:
         Time = aliased(NodeHyperdata)

@@ -268,6 +277,7 @@ def compute_coocs( corpus,
     # threshold
     # £TODO adjust COOC_THRESHOLD a posteriori:
     #       ex: sometimes 2 sometimes 4 depending on sparsity
+    print("COOCS: filtering pairs under threshold:", threshold)
     coocs_query = coocs_query.having(ucooc >= threshold)
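
What diagonal_filter toggles is easiest to see on a toy count: with the filter on, a term is never paired with itself, so the N(ii) diagonal cells that compute_specgen divides by would all be missing. A small pure-Python sketch (ours, not repo code):

from collections import Counter
from itertools import product

docs = [["water", "river"], ["water", "war"]]

def coocs(docs, diagonal_filter=True):
    # count, per document, every ordered pair of its terms
    counts = Counter()
    for doc in docs:
        for x, y in product(doc, doc):
            if diagonal_filter and x == y:
                continue        # skip the (t, t) diagonal pairs
            counts[(x, y)] += 1
    return counts

print(coocs(docs)[("water", "water")])                         # 0
print(coocs(docs, diagonal_filter=False)[("water", "water")])  # 2  <- the N(ii) used for spec/gen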
gargantext/util/toolchain/ngrams_extraction.py
----------------------------------------------

@@ -77,7 +77,7 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_
                     continue
                 # get ngrams
                 for ngram in ngramsextractor.extract(value):
-                    tokens = tuple(token[0] for token in ngram)
+                    tokens = tuple(normalize_forms(token[0]) for token in ngram)
                     if do_subngrams:
                         # ex tokens = ["very", "cool", "exemple"]

@@ -90,7 +90,7 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_
                         subterms = [tokens]
                     for seqterm in subterms:
-                        ngram = normalize_terms(' '.join(seqterm))
+                        ngram = ' '.join(seqterm)
                         if len(ngram) > 1:
                             # doc <=> ngram index
                             nodes_ngrams_count[(document.id, ngram)] += 1

@@ -118,7 +118,7 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_
         raise error

-def normalize_terms(term_str, do_lowercase=DEFAULT_ALL_LOWERCASE_FLAG):
+def normalize_forms(term_str, do_lowercase=DEFAULT_ALL_LOWERCASE_FLAG):
     """
     Removes unwanted trailing punctuation
     AND optionally puts everything to lowercase

@@ -127,14 +127,14 @@ def normalize_terms(term_str, do_lowercase=DEFAULT_ALL_LOWERCASE_FLAG):
     (benefits from normalize_chars upstream so there's less cases to consider)
     """
-    # print('normalize_terms IN: "%s"' % term_str)
-    term_str = sub(r'^[-",;/%(){}\\\[\]\.\']+', '', term_str)
-    term_str = sub(r'[-",;/%(){}\\\[\]\.\']+$', '', term_str)
+    # print('normalize_forms IN: "%s"' % term_str)
+    term_str = sub(r'^[-\'",;/%(){}\\\[\]\. ©]+', '', term_str)
+    term_str = sub(r'[-\'",;/%(){}\\\[\]\. ©]+$', '', term_str)

     if do_lowercase:
         term_str = term_str.lower()

-    # print('normalize_terms OUT: "%s"' % term_str)
+    # print('normalize_forms OUT: "%s"' % term_str)
     return term_str
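
A quick check (ours, not from the commit) of what the widened character class in normalize_forms now strips: it adds apostrophes, spaces and '©' to the leading/trailing punctuation that was already removed before:

from re import sub

def normalize_forms_demo(term_str, do_lowercase=True):
    # the same two substitutions as the patched normalize_forms
    term_str = sub(r'^[-\'",;/%(){}\\\[\]\. ©]+', '', term_str)
    term_str = sub(r'[-\'",;/%(){}\\\[\]\. ©]+$', '', term_str)
    return term_str.lower() if do_lowercase else term_str

print(normalize_forms_demo("'water cycle',"))   # -> water cycle
print(normalize_forms_demo("© Elsevier"))       # -> elsevier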
gargantext/views/api/ngramlists.py
----------------------------------

@@ -57,7 +57,7 @@ class CSVLists(APIView):
         params in request.GET:
             onto_corpus: the corpus whose lists are getting patched

-        params in request.FILES:
+        params in request.data:
             csvfile: the csv file

         /!\ We assume we checked the file size client-side before upload