Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
d673a1ec
Commit
d673a1ec
authored
Jul 22, 2016
by
c24b
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
METRIC SPEC
parent
60679e31
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
125 additions
and
0 deletions
+125
-0
metric_specificity.py
gargantext/util/toolchain/metric_specificity.py
+125
-0
No files found.
gargantext/util/toolchain/metric_specificity.py
0 → 100644
View file @
d673a1ec
"""
Computes a specificity metric from the ngram cooccurrence matrix.
+ SAVE => WeightedList => NodeNgram
"""
from
gargantext.models
import
Node
,
Ngram
,
NodeNgram
,
NodeNgramNgram
from
gargantext.util.db
import
session
,
aliased
,
func
,
bulk_insert
from
gargantext.util.lists
import
WeightedList
from
collections
import
defaultdict
from
pandas
import
DataFrame
import
pandas
as
pd
def
compute_specificity
(
corpus
,
cooc_id
=
None
,
cooc_matrix
=
None
,
overwrite_id
=
None
):
'''
Compute the specificity, simple calculus.
Parameters:
- cooc_id: mandatory id of a cooccurrences node to use as base
- overwrite_id: optional preexisting specificity node to overwrite
'''
matrix
=
defaultdict
(
lambda
:
defaultdict
(
float
))
if
cooc_id
==
None
and
cooc_matrix
==
None
:
raise
TypeError
(
"compute_specificity: needs a cooc_id or cooc_matrix param"
)
elif
cooc_id
:
cooccurrences
=
(
session
.
query
(
NodeNgramNgram
)
.
filter
(
NodeNgramNgram
.
node_id
==
cooc_id
)
)
# no filtering: cooc already filtered on mainlist_id at creation
for
cooccurrence
in
cooccurrences
:
matrix
[
cooccurrence
.
ngram1_id
][
cooccurrence
.
ngram2_id
]
=
cooccurrence
.
weight
matrix
[
cooccurrence
.
ngram2_id
][
cooccurrence
.
ngram1_id
]
=
cooccurrence
.
weight
elif
cooc_matrix
:
# copy WeightedMatrix into local matrix structure
for
(
ngram1_id
,
ngram2_id
)
in
cooc_matrix
.
items
:
w
=
cooc_matrix
.
items
[(
ngram1_id
,
ngram2_id
)]
matrix
[
ngram1_id
][
ngram2_id
]
=
w
nb_ngrams
=
len
(
matrix
)
print
(
"SPECIFICITY: computing on
%
i ngrams"
%
nb_ngrams
)
x
=
DataFrame
(
matrix
)
.
fillna
(
0
)
# proba (x/y) ( <= on divise chaque ligne par son total)
x
=
x
/
x
.
sum
(
axis
=
1
)
# vectorisation
# d:Matrix => v: Vector (len = nb_ngrams)
# v = d.sum(axis=1) (- lui-même)
xs
=
x
.
sum
(
axis
=
1
)
-
x
ys
=
x
.
sum
(
axis
=
0
)
-
x
# top inclus ou exclus
#n = ( xs + ys) / (2 * (x.shape[0] - 1))
# top generic or specific (asc is spec, desc is generic)
v
=
(
xs
-
ys
)
/
(
2
*
(
x
.
shape
[
0
]
-
1
))
## d ##
#######
# Grenelle biodiversité kilomètres site élus île
# Grenelle 0 0 4 0 0 0
# biodiversité 0 0 0 0 4 0
# kilomètres 4 0 0 0 4 0
# site 0 0 0 0 4 6
# élus 0 4 4 4 0 0
# île 0 0 0 6 0 0
## d.sum(axis=1) ##
###################
# Grenelle 4
# biodiversité 4
# kilomètres 8
# site 10
# élus 12
# île 6
# résultat temporaire
# -------------------
# pour l'instant on va utiliser les sommes en ligne comme ranking de spécificité
# (**même** ordre qu'avec la formule d'avant le refactoring mais calcul + simple)
# TODO analyser la cohérence math ET sem de cet indicateur
#v.sort_values(inplace=True)
# [ ('biodiversité' , 0.333 ),
# ('Grenelle' , 0.5 ),
# ('île' , 0.599 ),
# ('kilomètres' , 1.333 ),
# ('site' , 1.333 ),
# ('élus' , 1.899 ) ]
# ----------------
# specificity node
if
overwrite_id
:
# overwrite pre-existing id
the_id
=
overwrite_id
session
.
query
(
NodeNgram
)
.
filter
(
NodeNgram
.
node_id
==
the_id
)
.
delete
()
session
.
commit
()
else
:
specnode
=
corpus
.
add_child
(
typename
=
"SPECIFICITY"
,
name
=
"Specif (in:
%
s)"
%
corpus
.
id
)
session
.
add
(
specnode
)
session
.
commit
()
the_id
=
specnode
.
id
# print(v)
pd
.
options
.
display
.
float_format
=
'${:,.2f}'
.
format
if
not
v
.
empty
:
data
=
WeightedList
(
zip
(
v
.
index
.
tolist
()
,
v
.
values
.
tolist
()[
0
]
)
)
data
.
save
(
the_id
)
else
:
print
(
"WARNING: had no terms in COOCS => empty SPECIFICITY node"
)
return
(
the_id
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment