humanities / gargantext

Commit 89c8268c, authored Mar 10, 2016 by Romain Loth
prototype ngram toolchain in __init__.py (no mainlist nor maplist yet :/)
parent 61237884
Showing 8 changed files, with 368 additions and 115 deletions (+368 / -115):

    gargantext/constants.py                          +2    -0
    gargantext/util/lists.py                         +3    -1
    gargantext/util/toolchain/__init__.py            +47   -9
    gargantext/util/toolchain/list_stop.py           +33   -31
    gargantext/util/toolchain/ngram_coocs_tempo.py   +164  -0
    gargantext/util/toolchain/ngram_groups.py        +1    -1
    gargantext/util/toolchain/ngram_scores.py        +49   -18
    gargantext/util/toolchain/score_specificity.py   +69   -55
gargantext/constants.py

@@ -92,6 +92,8 @@ RESOURCETYPES = [
 #    },
 ]
 
+# linguistic extraction parameters
+DEFAULT_COOC_THRESHOLD = 4
 
 # other parameters
 # default number of docs POSTed to scrappers.views.py
...
gargantext/util/lists.py

@@ -178,6 +178,8 @@ class WeightedContextIndex(_BaseClass):
     def __init__(self, source=None):
         self.items = defaultdict(float)
+
+        # £TODO
...
@@ -222,7 +224,7 @@ class WeightedMatrix(_BaseClass):
         # insert new data
         bulk_insert(
             NodeNgramNgram,
-            ('node_id', 'ngram1_id', 'ngram2_id', 'score'),
+            ('node_id', 'ngram1_id', 'ngram2_id', 'weight'),
             ((node_id, key1, key2, value) for key1, key2, value in self)
         )
...
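The 'score' -> 'weight' rename aligns WeightedMatrix.save with the NodeNgramNgram.weight column used elsewhere in this commit. A minimal sketch of the row-generator pattern it relies on, with a hypothetical stand-in for gargantext's bulk_insert (the table name and ids here are made up):

    from collections import defaultdict

    # hypothetical stand-in for gargantext.util.db.bulk_insert
    def fake_bulk_insert(table, columns, rows):
        for row in rows:
            print(table, dict(zip(columns, row)))

    matrix = defaultdict(float)
    matrix[(487, 294)] = 4.0          # (ngram1_id, ngram2_id) -> weight
    matrix[(487, 301)] = 2.0

    node_id = 1                       # the node that will own the rows
    fake_bulk_insert(
        'nodes_ngrams_ngrams',
        ('node_id', 'ngram1_id', 'ngram2_id', 'weight'),
        ((node_id, k1, k2, w) for (k1, k2), w in matrix.items())
    )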
gargantext/util/toolchain/__init__.py

 from .parsing           import parse
 from .ngrams_extraction import extract_ngrams
-from .ngram_scores      import compute_occurrences_local, compute_tfidf_local
+from .list_stop         import compute_stop
+from .ngram_scores      import compute_occurrences_local, compute_tfidf
+from .ngram_coocs_tempo import compute_coocs
+from .score_specificity import compute_specificity
+from .list_map          import compute_mapList     # TEST
 from .ngram_groups      import compute_groups
 
 from gargantext.util.db import session
 from gargantext.models  import Node
+from datetime           import datetime
 
 def parse_extract(corpus):
     # retrieve corpus from database from id
     if isinstance(corpus, int):
...
@@ -21,16 +28,47 @@ def parse_extract(corpus):
     extract_ngrams(corpus)
     print('CORPUS #%d: extracted ngrams' % (corpus.id))
 
+    # -------------------------------
     # temporary ngram lists workflow
+    # -------------------------------
+    print('CORPUS #%d: [%s] starting ngram lists computation' % (corpus.id, t()))
+
+    # -> stoplist: compute + write (=> Node and NodeNgram)
+    stop_id = compute_stop(corpus)
+    print('CORPUS #%d: [%s] new stoplist node #%i' % (corpus.id, t(), stop_id))
+
+    # -> write local tfidf to Node and NodeNodeNgram
+    ltfidf_id = compute_tfidf(corpus, scope="local")
+    print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))
+
+    # -> write global tfidf to Node and NodeNodeNgram
+    gtfidf_id = compute_tfidf(corpus, scope="global")
+    print('CORPUS #%d: [%s] new globaltfidf node #%i' % (corpus.id, t(), gtfidf_id))
 
-    # write occurrences to Node and NodeNodeNgram
-    occnd_id = compute_occurrences_local(corpus)
-    print('CORPUS #%d: new occs node #%i' % (corpus.id, occnd_id))
+    # ?? mainlist: compute + write (to Node and NodeNgram)
+    # mainlist_id = compute_mainlist(corpus)
+    # print('CORPUS #%d: [%s] new mainlist node #%i' % (corpus.id, t(), mainlist_id))
 
-    # write local tfidf to Node and NodeNodeNgram
-    ltfidf_id = compute_tfidf_local(corpus)
-    print('CORPUS #%d: new localtfidf node #%i' % (corpus.id, ltfidf_id))
+    # -> cooccurrences: compute + write (=> Node and NodeNodeNgram)
+    cooc_id = compute_coocs(corpus, stop_id=None)
+    print('CORPUS #%d: [%s] new cooccs node #%i' % (corpus.id, t(), cooc_id))
+
+    # ?? specificity: compute + write (=> NodeNodeNgram)
+    spec_id = compute_specificity(cooc_id=cooc_id, corpus=corpus)
+    print('CORPUS #%d: [%s] new specificity node #%i' % (corpus.id, t(), cooc_id))
+
+    # ?? maplist: compute + write (to Node and NodeNgram)
+    # map_id = compute_stop(corpus)
+    # print('CORPUS #%d: [%s] new maplist node #%i' % (corpus.id, t(), map_id))
+
+    # -> write occurrences to Node and NodeNodeNgram   # possible: factorize with tfidf
+    occ_id = compute_occurrences_local(corpus)
+    print('CORPUS #%d: [%s] new occs node #%i' % (corpus.id, t(), occ_id))
 
-    # write groups to Node and NodeNgramNgram
+    # -> write groups to Node and NodeNgramNgram
     group_id = compute_groups(corpus, stoplist_id=None)
-    print('CORPUS #%d: new grouplist node #%i' % (corpus.id, group_id))
+    print('CORPUS #%d: [%s] new grouplist node #%i' % (corpus.id, t(), group_id))
+
+
+def t():
+    return datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
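This is the whole prototype pipeline announced in the commit message: parse, extract ngrams, then stoplist, local/global tfidf, coocs, specificity, occurrences and groups, each step returning the id of the Node it wrote (note the specificity print reuses cooc_id where spec_id was presumably intended). A hedged usage sketch, assuming an existing corpus; the id 123 is made up:

    from gargantext.util.toolchain import parse_extract

    # parse_extract accepts a corpus Node or a plain id
    # (the isinstance(corpus, int) branch fetches the Node from the DB)
    parse_extract(123)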
gargantext/util/toolchain/list_stop.py

@@ -2,15 +2,13 @@ from gargantext.util.db import *
 from gargantext.util.db_cache import *
 from gargantext.constants import *
-from gargantext.models.users import User
-from gargantext.models.nodes import Node
-from gargantext.models.ngrams import Ngram, NodeNgram
+from gargantext.util.db    import session, aliased, func
+from gargantext.util.lists import WeightedMatrix
+from gargantext.models     import User, Node, Ngram, NodeNgram
 import re
-import sqlalchemy as sa
-from sqlalchemy.sql import func
-from sqlalchemy.orm import aliased
-from sqlalchemy import desc, asc, or_, and_, Date, cast, select, literal_column
+from sqlalchemy import desc, asc
 
 #from ngram.tools import insert_ngrams
 
 def isStopWord(ngram, stop_words=None):
...
@@ -23,20 +21,16 @@ def isStopWord(ngram, stop_words=None):
     if word in stop_words:
         return(True)
 
-    def test_match(word, regex):
-        format_regex = re.compile(regex)
-        if format_regex.match(word):
-            return(True)
-
+    compiled_regexes = []    # to compile them only once
     for regex in [
           "^.{1,2}$"
        , "(.*)\d(.*)"
-       , "(.*)(\.)(.*)"
+       # , "(.*)(\.)(.*)"    too strong (removes acronyms!)
        , "(.*)(\,)(.*)"
        , "(.*)(< ?/?p ?>)(.*)"          # paragraph markers
        , "(.*)(study)(.*)"
-       , "(.*)(xx|xi|xv)(.*)"
+       , "(.*)\b(xx|xi|xv)\b(.*)"
        , "(.*)(result)(.*)"
        , "(.*)(année|nombre|moitié)(.*)"
        , "(.*)(temps)(.*)"
...
@@ -47,9 +41,15 @@ def isStopWord(ngram, stop_words=None):
        , "(.*)(travers)(.*)"
        , "(.*)(:|\|)(.*)"
        ]:
-        if test_match(word, regex) is True:
-            return(True)
+        compiled_regexes.append(re.compile(regex))
+
+    for format_regex in compiled_regexes:
+        if format_regex.match(word):
+            # print("STOPLIST += '%s' (regex: %s)" % (word, format_regex.pattern))
+            return(True)
+
     return False
 
 def create_gargantua_resources():
     gargantua_id = session.query(User.id).filter(User.username == "gargantua").first()
     project = Node(
...
@@ -61,32 +61,33 @@ def create_gargantua_resources():
     session.add(stopList)
     session.commit()
 
-def compute_stop(corpus_id, stopList_id=None, limit=2000, debug=False):
+def compute_stop(corpus, stopList_id=None, debug=False):
     '''
     Create list of stop words.
     TODO do a function to get all stop words with social scores
     '''
     # Get the StopList if it exist or create a new one
     # At this step of development, a new StopList should be created
     if stopList_id == None:
         stopList_id = session.query(Node.id).filter(
-                        Node.parent_id == corpus_id,
+                        Node.parent_id == corpus.id,
                         Node.typename == "STOPLIST"
                       ).first()
         if stopList_id == None:
-            corpus = cache.Node[corpus_id]
             stopList = Node(name="STOPLIST",
-                            parent_id=corpus_id,
-                            user_id=user_id,
+                            parent_id=corpus.id,
+                            user_id=corpus.user_id,
                             typename="STOPLIST")
             session.add(stopList)
             session.commit()
             stopList_id = stopList.id
 
     # For tests only
     if debug == True:
         session.query(Node).filter(Node.id == stopList_id).delete()
         session.commit()
 
     # Get common resources, all common StopWords on the platform
     ## First get the id of the StopList of Gargantua super user
     gargantua_id = session.query(User.id).filter(User.username == "gargantua").first()
...
@@ -101,16 +102,16 @@ def compute_stop(corpus_id,stopList_id=None,limit=2000,debug=False):
         .filter(NodeNgram.node_id == rootStopList_id)
         .all()
         )
-    print([n for n in stop_words])
+    # print([n for n in stop_words])
 
     ## Get the ngrams
     ## ngrams :: [(Int, String, Int)]
-    frequency = sa.func.count(NodeNgram.weight)
+    frequency = func.count(NodeNgram.weight)
     ngrams = (session.query(Ngram.id, Ngram.terms, frequency)
               .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
              .join(Node, Node.id == NodeNgram.node_id)
-             .filter(Node.parent_id == corpus_id,
+             .filter(Node.parent_id == corpus.id,
                      Node.typename == "DOCUMENT")
              .group_by(Ngram.id)
              .order_by(desc(frequency))
...
@@ -119,9 +120,10 @@ def compute_stop(corpus_id,stopList_id=None,limit=2000,debug=False):
     )
     ngrams_to_stop = filter(
         lambda x: isStopWord(x, stop_words=stop_words), ngrams)
 
-    print([n for n in ngrams_to_stop])
+    # print([n for n in ngrams_to_stop])
 
     stop = LISTTYPES["STOPLIST"]({n[0]: -1 for n in ngrams_to_stop})
     # stop = LISTTYPES["STOPLIST"]([n[0] for n in ngrams_to_stop])
     stop.save(stopList_id)
+
+    return stopList_id
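The refactor replaces the inner test_match helper, which recompiled every regex on each call to isStopWord, with a list of precompiled patterns. A self-contained illustration of the same compile-once idea, going one step further than the commit by hoisting the list to module level; the patterns are a made-up subset:

    import re

    PATTERNS = ["^.{1,2}$", r"(.*)\d(.*)"]
    COMPILED = [re.compile(p) for p in PATTERNS]   # compiled once, at import time

    def looks_like_stopword(word):
        # every call reuses the precompiled objects
        return any(rx.match(word) for rx in COMPILED)

    assert looks_like_stopword("ab")               # too short
    assert looks_like_stopword("term42")           # contains a digit
    assert not looks_like_stopword("biodiversity")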
gargantext/util/toolchain/ngram_coocs_tempo.py (new file, 0 → 100644)

from gargantext.models     import Node, NodeNgram, NodeNgramNgram
from gargantext.util.lists import WeightedMatrix
from gargantext.util.db    import session, aliased, func
from gargantext.constants  import DEFAULT_COOC_THRESHOLD

def compute_coocs(corpus,
                  threshold       = DEFAULT_COOC_THRESHOLD,
                  weighted        = False,
                  our_id          = None,
                  stop_id         = None,
                  symmetry_filter = True):
    """
    Count how often some extracted terms appear
    together in a small context (document)
    throughout a larger context (corpus).

    node_id | ngram_id | weight        ngram1_id | ngram2_id | ucooc | wcooc |
    --------+----------+--------   =>  ----------+-----------+-------+-------+
     MYDOC  |   487    |   1             487     |    294    |   1   |   4   |
     MYDOC  |   294    |   3

    Fill that info in DB:
      - a *new* COOCCURRENCES node
      - and all corresponding NodeNgramNgram rows

    worst-case complexity ~ O(N²/2) with N = number of ngrams

    Parameters:
      - threshold: on output ucooc count (previously called hapax)
      - weighted: if False, normal cooc to be saved as result
                  if True, weighted cooc (experimental)
      - stop_id: stoplist for filtering input ngrams
      - TODO cvalue_id: allow a metric as input filter
      - TODO n_min, n_max: filter on Ngram.n (aka length of ngram)
      - TODO start, end: filter on document date

    (deprecated parameters)
      - field1,2: allowed to count other things than ngrams (eg tags) but no use case at present
      - isMonopartite: ?? used a nodes_hyperdata_ngrams table ???

    basic idea for one doc
    ======================
    each pair of ngrams sharing the same doc (node_id):
        SELECT idx1.ngram_id, idx2.ngram_id
        FROM nodes_ngrams AS idx1, nodes_ngrams AS idx2
        ---------------------------------
        WHERE idx1.node_id = idx2.node_id      <== that's cooc
        ---------------------------------
        AND idx1.ngram_id <> idx2.ngram_id
        AND idx1.node_id = MY_DOC ;

    on entire corpus
    ================
    coocs for each doc:
      - each given pair like (termA, termB) will likely appear several times
        => we do GROUP BY (x1.ngram_id, x2.ngram_id)
      - normally we can count unique appearances of the pair (ucooc)
      - we can count the sum of weights in the pair (wcooc or cofreq)

    TODO
    ====
    use WeightedMatrix
    """
    # /!\ big combinatorial complexity /!\
    # for 8439 rows in the nodes_ngrams index, of which 1442 have occ > 1:
    #   1,859,408 rows for the simple cooc query
    #      71,134 rows when restricted to ngrams with occ > 1 (weight)

    # docs of our corpus
    docids_subquery = (session
                        .query(Node.id)
                        .filter(Node.parent_id == corpus.id)
                        .filter(Node.typename == "DOCUMENT")
                        .subquery()
                       )

    # 2 x the occurrence index table
    x1 = aliased(NodeNgram)
    x2 = aliased(NodeNgram)

    # cooccurrences column definition
    ucooc = func.count(x1.ngram_id).label("ucooc")

    # 1) MAIN DB QUERY
    coocs_query = (session
                    .query(x1.ngram_id, x2.ngram_id, ucooc)
                    .filter(x1.node_id == x2.node_id)          # <- by definition of cooc
                    .filter(x1.ngram_id != x2.ngram_id)        # <- b/c not with itself
                    .filter(x1.node_id.in_(docids_subquery))   # <- b/c within corpus
                    .group_by(x1.ngram_id, x2.ngram_id)
                   )

    # 2) INPUT FILTERS (reduce N before O(N²))
    # £TODO add possibility to restrict to the mainlist
    if stop_id:
        stop_subquery = (session
                          .query(NodeNgram.ngram_id)
                          .filter(NodeNgram.node_id == stop_id)
                          .subquery()
                         )
        coocs_query = (coocs_query
                        .filter(~ x1.ngram_id.in_(stop_subquery))
                        .filter(~ x2.ngram_id.in_(stop_subquery))
                       )

    if symmetry_filter:
        # 1 filter taking symmetry into account
        #  -> halves the work!!
        #  -> but blocks direct access to the cooccurrences of x2:
        #     they end up scattered, recorded under whichever x1 preceded x2,
        #     so retrieving them later is more costly, via OR queries like:
        #     WHERE ngram1 = my_ngram OR ngram2 = my_ngram
        coocs_query = coocs_query.filter(x1.ngram_id < x2.ngram_id)

    # ------------
    # 2 possible upstream filters to reduce the combinatorics
    #  - for instance 929k rows => 35k rows
    #  - here on weight, but it degrades the results
    #  => conceivable on another metric (cvalue or tfidf?)
    # coocs_query = coocs_query.filter(x1.weight > 1)
    # coocs_query = coocs_query.filter(x2.weight > 1)
    # ------------

    # 3) OUTPUT FILTERS
    # ------------------
    # threshold
    coocs_query = coocs_query.having(ucooc > threshold)

    # 4) EXECUTE QUERY
    # ----------------
    #  => storage in our matrix structure
    matrix = WeightedMatrix(coocs_query.all())

    # 5) SAVE
    # --------
    if our_id:
        # use pre-existing id
        the_id = our_id
    else:
        # create the new cooc node
        the_cooc = Node(
            typename  = "COOCCURRENCES",
            name      = "Coocs (in: %s)" % corpus.name[0:10],
            parent_id = corpus.id,
            user_id   = corpus.user_id,
            # saving the parameters of the analysis in the Node JSON
            hyperdata = {'corpus': corpus.id, 'threshold': threshold}
        )
        session.add(the_cooc)
        session.commit()
        the_id = the_cooc.id

    # ==> save all NodeNgramNgram with link to new cooc node id
    matrix.save(the_id)

    return the_id
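A toy rendering, independent of the DB layer, of what the grouped query computes: ucooc counts in how many documents each unordered pair of ngrams co-occurs, and combinations over sorted ids plays the role of the x1.ngram_id < x2.ngram_id symmetry filter. The document contents are made up:

    from collections import Counter
    from itertools import combinations

    docs = {
        'doc1': {487, 294, 301},
        'doc2': {487, 294},
        'doc3': {294, 301},
    }

    ucooc = Counter()
    for ngram_ids in docs.values():
        for pair in combinations(sorted(ngram_ids), 2):
            ucooc[pair] += 1          # one count per document, per unordered pair

    threshold = 1
    kept = {pair: n for pair, n in ucooc.items() if n > threshold}
    print(kept)                       # {(294, 301): 2, (294, 487): 2}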
gargantext/util/toolchain/ngram_groups.py

@@ -32,7 +32,7 @@ def compute_groups(corpus, stoplist_id = None):
     stop_ngrams_ids = {}
     # we will need the ngrams of the stoplist to filter
     if stoplist_id is not None:
-        for id in session.query(NodeNgram.id).filter(NodeNgram.node_id == stoplist_id).all():
+        for id in session.query(NodeNgram.ngram_id).filter(NodeNgram.node_id == stoplist_id).all():
             stop_ngrams_ids[id[0]] = True
...
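This one-line fix matters because session.query(SomeColumn) yields 1-tuples of that column: querying NodeNgram.id presumably filled the dict with row ids instead of ngram ids, so the stoplist filter never matched anything. A plain-Python rendering of the corrected loop, with made-up rows:

    # each row from session.query(NodeNgram.ngram_id)... is a 1-tuple like (294,)
    fake_rows = [(294,), (301,), (487,)]

    stop_ngrams_ids = {}
    for id in fake_rows:
        stop_ngrams_ids[id[0]] = True     # unpack the 1-tuple, keep the ngram id

    assert stop_ngrams_ids == {294: True, 301: True, 487: True}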
gargantext/util/toolchain/ngram_scores.py

 from gargantext.models  import Node, NodeNgram, NodeNodeNgram
 from gargantext.util.db import session, bulk_insert
+from sqlalchemy         import text
 
 # £TODO
 # from gargantext.util.lists import WeightedContextIndex
...
@@ -57,19 +57,48 @@ def compute_occurrences_local(corpus):
     return occnode.id
 
-def compute_tfidf_local(corpus):
+def compute_tfidf(corpus, scope="local"):
     """
     Calculates tfidf within the current corpus
+
+    Parameter:
+      - scope: {"local" or "global"}
     """
-    # ?? FIXME could we keep the docids somehow from previous computations ??
-    docids_subquery = (session
-                        .query(Node.id)
-                        .filter(Node.parent_id == corpus.id)
-                        .filter(Node.typename == "DOCUMENT")
-                        .subquery()
-                       )
+    # local <=> within this corpus
+    if scope == "local":
+        # All docs of this corpus
+        docids_subquery = (session
+                            .query(Node.id)
+                            .filter(Node.parent_id == corpus.id)
+                            .filter(Node.typename == "DOCUMENT")
+                            .subquery()
+                           )
+    # global <=> within all corpora of this source
+    elif scope == "global":
+        this_source_type = corpus.resources()[0]['type']
+
+        # all corpora with the same source type
+        # (we need raw SQL query for postgres JSON operators) (TODO test speed)
+        same_source_corpora_query = (session
+                            .query(Node.id)
+                            .from_statement(text(
+                                """
+                                SELECT id FROM nodes
+                                WHERE hyperdata->'resources' @> '[{\"type\"\:%s}]'
+                                """ % this_source_type
+                                ))
+                           )
+
+        # All docs **in all corpora of the same source**
+        docids_subquery = (session
+                            .query(Node.id)
+                            .filter(Node.parent_id.in_(same_source_corpora_query))
+                            .filter(Node.typename == "DOCUMENT")
+                            .subquery()
+                           )
 
     # N
     total_docs = session.query(docids_subquery).count()
 
     # or perhaps at least do the occurrences right now at the same time
...
@@ -93,12 +122,14 @@ def compute_tfidf_local(corpus):
     # -------------------------------------------------
     # create the new TFIDF-CORPUS node
-    ltfidf = Node()
-    ltfidf.typename  = "TFIDF-CORPUS"
-    ltfidf.name      = "tfidf (in: %s)" % corpus.id
-    ltfidf.parent_id = corpus.id
-    ltfidf.user_id   = corpus.user_id
-    session.add(ltfidf)
+    tfidf_nd = Node(parent_id=corpus.id, user_id=corpus.user_id)
+    if scope == "local":
+        tfidf_nd.typename = "TFIDF-CORPUS"
+        tfidf_nd.name     = "tfidf-c (in: %s)" % corpus.id
+    elif scope == "global":
+        tfidf_nd.typename = "TFIDF-GLOBAL"
+        tfidf_nd.name     = "tfidf-g (in type: %s)" % this_source_type
+    session.add(tfidf_nd)
     session.commit()
 
     # reflect that in NodeNodeNgrams
...
@@ -106,7 +137,7 @@ def compute_tfidf_local(corpus):
     bulk_insert(
         NodeNodeNgram,
         ('node1_id', 'node2_id', 'ngram_id', 'score'),
-        ((ltfidf.id,   corpus.id, ng, tfidfs[ng]) for ng in tfidfs)
+        ((tfidf_nd.id, corpus.id, ng, tfidfs[ng]) for ng in tfidfs)
     )
 
-    return ltfidf.id
+    return tfidf_nd.id
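Usage of the new scope parameter, as wired up in __init__.py above (corpus is assumed to be a corpus Node already loaded from the session):

    from gargantext.util.toolchain.ngram_scores import compute_tfidf

    ltfidf_id = compute_tfidf(corpus, scope="local")    # N = docs of this corpus
    gtfidf_id = compute_tfidf(corpus, scope="global")   # N = docs of every corpus whose
                                                        # resource 'type' matches this one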
gargantext/util/toolchain/score_specificity.py

-from gargantext.util.db import *
+from gargantext.util.db import session, aliased, func
 from gargantext.util.db_cache import *
 from gargantext.constants import *
-from gargantext.util.analysis.cooccurrences import do_cooc
-from gargantext.models.ngrams import Ngram, NodeNgram, \
-                                     NodeNgramNgram, NodeNodeNgram
+# from gargantext.util.analysis.cooccurrences import do_cooc
+from gargantext.models import Node, Ngram, NodeNgramNgram, NodeNodeNgram
 
-import numpy as np
 import pandas as pd
 from collections import defaultdict
-from sqlalchemy import desc, asc, or_, and_, Date, cast, select
 
-def specificity(cooc_id=None, corpus=None, limit=100, session=None):
+def compute_specificity(corpus, cooc_id, limit=100):
     '''
     Compute the specificity, simple calculus.
     '''
     cooccurrences = (session.query(NodeNgramNgram)
                      .filter(NodeNgramNgram.node_id == cooc_id)
-                     .order_by(NodeNgramNgram.score)
-                     .limit(limit)
+                     # no filtering: new choice, filter on tfidf before creation
+                     # .order_by(NodeNgramNgram.weight)
+                     # .limit(limit)
                      )
 
     matrix = defaultdict(lambda : defaultdict(float))
 
+    # £TODO re-rename weight => score
     for cooccurrence in cooccurrences:
-        matrix[cooccurrence.ngramx_id][cooccurrence.ngramy_id] = cooccurrence.score
-        matrix[cooccurrence.ngramy_id][cooccurrence.ngramx_id] = cooccurrence.score
+        matrix[cooccurrence.ngram1_id][cooccurrence.ngram2_id] = cooccurrence.weight
+        matrix[cooccurrence.ngram2_id][cooccurrence.ngram1_id] = cooccurrence.weight
 
-    x = pd.DataFrame(matrix).fillna(0)
-    x = x / x.sum(axis=1)
+    nb_ngrams = len(matrix)
+    d = pd.DataFrame(matrix).fillna(0)
 
-    xs = x.sum(axis=1)
-    ys = x.sum(axis=0)
+    # proba (x/y) ( <= we divide each column by its total)
+    d = d / d.sum(axis=0)
 
-    m = (xs - ys) / (2 * (x.shape[0] - 1))
-    m = m.sort(inplace=False)
+    # d:Matrix => v: Vector (len = nb_ngrams)
+    v = d.sum(axis=1)
 
-    #node = get_or_create_node(nodetype='Specificity',corpus=corpus)
+    ## d ##
+    #######
+    #                 Grenelle  biodiversité  kilomètres  site  élus  île
+    # Grenelle            0          0            4         0     0    0
+    # biodiversité        0          0            0         0     4    0
+    # kilomètres          4          0            0         0     4    0
+    # site                0          0            0         0     4    6
+    # élus                0          4            4         4     0    0
+    # île                 0          0            0         6     0    0
+
+    ## d.sum(axis=1) ##
+    ###################
+    # Grenelle            4
+    # biodiversité        4
+    # kilomètres          8
+    # site               10
+    # élus               12
+    # île                 6
+
+    # temporary result
+    # ----------------
+    # for now we use the row sums as the specificity ranking
+    # (the **same** order as with the pre-refactoring formula, but a simpler computation)
+    # TODO check the mathematical AND semantic coherence of this indicator
+    v.sort_values(inplace=True)
+
+    # [ ('biodiversité' , 0.333 ),
+    #   ('Grenelle'     , 0.5   ),
+    #   ('île'          , 0.599 ),
+    #   ('kilomètres'   , 1.333 ),
+    #   ('site'         , 1.333 ),
+    #   ('élus'         , 1.899 ) ]
+
+    # ----------------
+    # specificity node
     node = session.query(Node).filter(
-        Node.parent_id == corpus_id,
+        Node.parent_id == corpus.id,
         Node.typename == "SPECIFICITY"
         ).first()
 
     if node == None:
-        corpus = cache.Node[corpus_id]
         user_id = corpus.user_id
-        node = Node(name="SPECIFICITY", parent_id=corpus_id,
-                    user_id=user_id, typename="SPECIFICITY")
+        node = Node(name="Specif (in: %i)" % corpus.id, parent_id=corpus.id,
+                    user_id=user_id, typename="SPECIFICITY")
         session.add(node)
         session.commit()
 
-    data = zip( [node.id for i in range(1, m.shape[0])]
-              , [corpus.id for i in range(1, m.shape[0])]
-              , m.index.tolist()
-              , m.values.tolist()
-              )
+    data = zip( [node.id] * nb_ngrams
+              , [corpus.id] * nb_ngrams
+              , v.index.tolist()
+              , v.values.tolist()
+              )
 
-    session.query(NodeNodeNgram).filter(NodeNodeNgram.nodex_id == node.id).delete()
+    session.query(NodeNodeNgram).filter(NodeNodeNgram.node1_id == node.id).delete()
     session.commit()
 
-    bulk_insert(NodeNodeNgram, ['nodex_id','nodey_id','ngram_id','score'], [d for d in data])
+    bulk_insert(NodeNodeNgram, ['node1_id','node2_id','ngram_id','score'], [d for d in data])
 
     return(node.id)
 
-def compute_specificity(corpus, limit=100, session=None):
-    '''
-    Computing specificities as NodeNodeNgram.
-    All workflow is the following:
-    1) Compute the cooc matrix
-    2) Compute the specificity score, saving it in database, return its Node
-    '''
-    #dbg = DebugTime('Corpus #%d - specificity' % corpus.id)
-    #list_cvalue = get_or_create_node(nodetype='Cvalue', corpus=corpus)
-    cooc_id = do_cooc(corpus=corpus, cvalue_id=list_cvalue.id, limit=limit)
-    specificity(cooc_id=cooc_id, corpus=corpus, limit=limit, session=session)
-    #dbg.show('specificity')
 
 #corpus=session.query(Node).filter(Node.id==244250).first()
 #compute_specificity(corpus)
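A runnable miniature of the ranking above (pandas assumed; the three ngram labels and their weights are made up). Each column of the symmetric cooc matrix is normalized into a conditional distribution, and the row sums then serve as the specificity ranking:

    import pandas as pd
    from collections import defaultdict

    # symmetric toy cooc matrix: A-B co-occur 4 times, B-C twice
    matrix = defaultdict(lambda: defaultdict(float))
    for n1, n2, w in [('A', 'B', 4), ('B', 'C', 2)]:
        matrix[n1][n2] = w
        matrix[n2][n1] = w

    d = pd.DataFrame(matrix).fillna(0)
    d = d / d.sum(axis=0)       # each column divided by its total
    v = d.sum(axis=1)           # row sums, as in the code above
    v.sort_values(inplace=True)
    print(v)                    # C = 0.333..., A = 0.666..., B = 2.0
                                # B, which co-occurs with both others, ranks highest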