Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
b33f37eb
Commit
b33f37eb
authored
Sep 29, 2015
by
delanoe
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FEAT] Generic cooccurrence function with miam_id, stop_id, group_id.
parent
38556c56
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
123 additions
and
200 deletions
+123
-200
cooccurrences.py
analysis/cooccurrences.py
+107
-86
db.py
gargantext_web/db.py
+1
-0
specificity.py
ngram/specificity.py
+13
-112
workflow.py
ngram/workflow.py
+2
-2
No files found.
analysis/cooccurrences.py
View file @
b33f37eb
from
env
import
*
from
admin.utils
import
PrintException
from
gargantext_web.db
import
NodeNgram
from
gargantext_web.db
import
*
from
parsing.corpustools
import
*
import
sqlalchemy
from
sqlalchemy.sql
import
func
from
sqlalchemy
import
desc
,
asc
,
or_
,
and_
,
Date
,
cast
,
select
from
sqlalchemy
import
literal_column
from
sqlalchemy.orm
import
aliased
from
sqlalchemy.sql
import
func
from
gargantext_web.db
import
Node
,
NodeNgram
,
NodeNgramNgram
,
NodeNodeNgram
,
NodeHyperdata
,
Hyperdata
from
gargantext_web.db
import
session
,
cache
,
get_or_create_node
,
bulk_insert
from
analysis.lists
import
WeightedMatrix
,
UnweightedList
,
Translations
# from gargantext_web.db import Node, get_cursor
def cooc(corpus=None, miam_id=None, stop_id=None, group_id=None,
         start=None, end=None, limit=1000):
    '''
    Compute the cooccurrence matrix of a corpus and save it, returning
    the Node.id of the "Cooccurrence" node that stores it
    (rows are NodeNgramNgram keyed on that node_id).

    For the moment lists of parameters are not supported: lists need
    to be merged beforehand.

    corpus   :: Corpus
    miam_id  :: Int  -- Node.id of the main ("Miam") list
    stop_id  :: Int  -- Node.id of the stop list
    group_id :: Int  -- Node.id of the group list

    For the moment start and end are simple; only the year is implemented yet.
    start :: TimeStamp -- example: '2010-05-30 02:00:00+02'
    end   :: TimeStamp
    limit :: Int  -- maximum number of (x, y) pairs kept
    '''
    # Get (or create) the node that will hold the matrix, then delete any
    # previous content so that recomputation is idempotent.
    node_cooc = get_or_create_node(
        nodetype='Cooccurrence', corpus=corpus,
        name_str="Cooccurrences corpus " + str(corpus.id)
                 + "list_id: " + str(miam_id))

    # TODO : save parameters in Node
    # args, _, _, parameters = inspect.getargvalues(inspect.currentframe())
    # print(parameters)
    # for parameter in parameters.keys():
    #     print(parameters[parameter])
    #     node_cooc.hyperdata[parameter] = parameters[parameter]
    #
    # session.add(node_cooc)
    # session.commit()
    # print(node_cooc.hyperdata)

    session.query(NodeNgramNgram).filter(
        NodeNgramNgram.node_id == node_cooc.id).delete()
    session.commit()

    # Two aliases on NodeNgram, one per side of the cooccurrence pair.
    NodeNgramX = aliased(NodeNgram)
    NodeNgramY = aliased(NodeNgram)

    doc_id = cache.NodeType['Document'].id

    # Count, for each (x, y) pair of ngrams, the number of document nodes
    # in which both appear.
    cooc_query = (session.query(NodeNgramX.ngram_id,
                                NodeNgramY.ngram_id,
                                func.count())
                  .join(Node, Node.id == NodeNgramX.node_id)
                  .join(NodeNgramY, NodeNgramY.node_id == Node.id)
                  )

    # Optional date window, via the 'datetime' hyperdata attached to each
    # document node.
    if start is not None:
        Start = aliased(NodeHyperdata)
        StartFormat = aliased(Hyperdata)
        cooc_query = (cooc_query
                      .join(Start, Start.node_id == Node.id)
                      .join(StartFormat, StartFormat.id == Start.hyperdata_id)
                      .filter(StartFormat.name == 'datetime')
                      .filter(Start.value_datetime >= start)
                      )

    if end is not None:
        End = aliased(NodeHyperdata)
        EndFormat = aliased(Hyperdata)
        cooc_query = (cooc_query
                      .join(End, End.node_id == Node.id)
                      .join(EndFormat, EndFormat.id == End.hyperdata_id)
                      .filter(EndFormat.name == 'datetime')
                      .filter(End.value_datetime <= end)
                      )

    cooc_query = (cooc_query
                  .filter(Node.parent_id == corpus.id, Node.type_id == doc_id)
                  # keep each unordered pair exactly once
                  .filter(NodeNgramX.ngram_id < NodeNgramY.ngram_id)
                  .group_by(NodeNgramX.ngram_id, NodeNgramY.ngram_id)
                  # NOTE(review): ascending count + limit keeps the *least*
                  # frequent pairs — confirm desc(func.count()) was not intended.
                  .order_by(func.count())
                  .limit(limit)
                  )

    matrix = WeightedMatrix(cooc_query)

    if miam_id is not None:
        # miam = get_or_create_node(nodetype='Cvalue', corpus=corpus)
        miam_list = UnweightedList(
            session.query(NodeNodeNgram.ngram_id)
                   .filter(NodeNodeNgram.nodex_id == miam_id)
                   .all())

    if stop_id is not None:
        # stop = get_or_create_node(nodetype='StopList', corpus=corpus)
        stop_list = UnweightedList(
            session.query(NodeNgram.ngram_id)
                   .filter(NodeNgram.node_id == stop_id)
                   .all())

    if group_id is not None:
        # group = get_or_create_node(nodetype='GroupList', corpus=corpus)
        # BUGFIX: the filter used stop_id (copy/paste error); the group
        # pairs are stored under group_id.
        group_list = UnweightedList(
            session.query(NodeNgramNgram.ngramx_id, NodeNgramNgram.ngramy_id)
                   .filter(NodeNgramNgram.node_id == group_id)
                   .all())

    # Combine the raw matrix with whichever lists were supplied.
    if miam_id is not None and stop_id is None and group_id is None:
        cooc = matrix & miam_list
    elif miam_id is not None and stop_id is not None and group_id is None:
        cooc = (matrix & miam_list) - stop_list
    elif miam_id is not None and stop_id is not None and group_id is not None:
        cooc = (matrix & miam_list & group_list) - stop_list

    cooc.save(node_cooc.id)
    return node_cooc.id
gargantext_web/db.py
View file @
b33f37eb
...
...
@@ -62,6 +62,7 @@ for model_name, model in models.__dict__.items():
NodeNgram
=
Node_Ngram
NodeResource
=
Node_Resource
NodeHyperdata
=
Node_Hyperdata
# manually declare the Node table...
from
datetime
import
datetime
...
...
ngram/specificity.py
View file @
b33f37eb
#from admin.env import *
import
inspect
from
admin.utils
import
PrintException
,
DebugTime
from
django.db
import
connection
,
transaction
from
sqlalchemy
import
desc
,
asc
,
or_
,
and_
,
Date
,
cast
,
select
from
sqlalchemy
import
literal_column
from
sqlalchemy.orm
import
aliased
from
sqlalchemy.sql
import
func
from
gargantext_web.db
import
Node
,
NodeNgram
,
NodeNgramNgram
,
NodeNodeNgram
from
gargantext_web.db
import
session
,
cache
,
get_or_create_node
,
bulk_insert
from
collections
import
defaultdict
import
numpy
as
np
import
pandas
as
pd
from
analysis.lists
import
WeightedMatrix
,
UnweightedList
def cooc(corpus=None, list_id=None, limit=1000):
    '''
    Compute the cooccurrence matrix of a corpus restricted to the Cvalue
    list and minus the stop list, save it, and return the Node.id of the
    "Cooccurrence" node that stores it.

    corpus  :: Corpus
    list_id :: Int -- Node.id of the Cvalue list (used in the node name)
    limit   :: Int -- maximum number of (x, y) pairs kept
    '''
    # Get (or create) the storage node and wipe any previous content so
    # recomputation is idempotent.
    node_cooc = get_or_create_node(
        nodetype='Cooccurrence', corpus=corpus,
        name_str="Cooccurrences corpus " + str(corpus.id)
                 + "for list Cvalue" + str(list_id))

    session.query(NodeNgramNgram).filter(
        NodeNgramNgram.node_id == node_cooc.id).delete()
    session.commit()

    # Two aliases on NodeNgram, one per side of the pair.
    NodeNgramX = aliased(NodeNgram)
    NodeNgramY = aliased(NodeNgram)

    doc_id = cache.NodeType['Document'].id

    # literal_column(str(miam_id)).label("node_id"),
    # Count, for each (x, y) pair of ngrams, the number of documents of
    # the corpus containing both.
    query = (session.query(NodeNgramX.ngram_id,
                           NodeNgramY.ngram_id,
                           func.count())
             .join(Node, Node.id == NodeNgramX.node_id)
             .join(NodeNgramY, NodeNgramY.node_id == Node.id)
             .filter(Node.parent_id == corpus.id, Node.type_id == doc_id)
             # keep each unordered pair exactly once
             .filter(NodeNgramX.ngram_id < NodeNgramY.ngram_id)
             .group_by(NodeNgramX.ngram_id, NodeNgramY.ngram_id)
             # NOTE(review): ascending count + limit keeps the *least*
             # frequent pairs — confirm desc() was not intended.
             .order_by(func.count())
             .limit(limit)
             )

    cvalue_id = get_or_create_node(nodetype='Cvalue', corpus=corpus).id
    stop_id = get_or_create_node(nodetype='StopList', corpus=corpus).id

    cvalue_list = UnweightedList(
        session.query(NodeNodeNgram.ngram_id)
               .filter(NodeNodeNgram.nodex_id == cvalue_id)
               .all())
    stop_list = UnweightedList(
        session.query(NodeNgram.ngram_id)
               .filter(NodeNgram.node_id == stop_id)
               .all())

    matrix = WeightedMatrix(query)

    # BUGFIX: the original expression `matrix & cvalue_list - stop_list`
    # parses as `matrix & (cvalue_list - stop_list)` because `-` binds
    # tighter than `&` in Python. The intent — as made explicit in the
    # generic replacement in analysis/cooccurrences.py — is to intersect
    # first, then remove the stop list.
    cooc = (matrix & cvalue_list) - stop_list
    cooc.save(node_cooc.id)
    return node_cooc.id
def
coocOld
(
corpus
=
None
,
list_id
=
None
,
limit
=
100
):
def
specificity
(
cooc_id
=
None
,
corpus
=
None
):
'''
cooc :: Corpus -> Int -> NodeNgramNgram
Compute the specificity, simple calculus.
'''
cursor
=
connection
.
cursor
()
node_cooc
=
get_or_create_node
(
nodetype
=
'Cooccurrence'
,
corpus
=
corpus
,
name_str
=
"Cooccurrences corpus "
+
str
(
corpus
.
id
)
+
"for list Cvalue"
+
str
(
list_id
))
session
.
query
(
NodeNgramNgram
)
.
filter
(
NodeNgramNgram
.
node_id
==
node_cooc
.
id
)
.
delete
()
session
.
commit
()
query_cooc
=
"""
INSERT INTO node_nodengramngram (node_id, "ngramx_id", "ngramy_id", score)
SELECT
%
d as node_id,
ngX.id,
ngY.id,
COUNT(*) AS score
FROM
node_node AS n -- the nodes who are direct children of the corpus
INNER JOIN
node_node_ngram AS nngX ON nngX.node_id = n.id -- list of ngrams contained in the node
INNER JOIN
node_nodenodengram AS whitelistX ON whitelistX.ngram_id = nngX.ngram_id -- list of ngrams contained in the whitelist and in the node
INNER JOIN
node_ngram AS ngX ON ngX.id = whitelistX.ngram_id -- ngrams which are in both
INNER JOIN
node_node_ngram AS nngY ON nngY.node_id = n.id
INNER JOIN
node_nodenodengram AS whitelistY ON whitelistY.ngram_id = nngY.ngram_id
INNER JOIN
node_ngram AS ngY ON ngY.id = whitelistY.ngram_id
WHERE
n.parent_id =
%
s
AND
whitelistX.nodex_id =
%
s
AND
whitelistY.nodex_id =
%
s
AND
nngX.ngram_id < nngY.ngram_id -- so we only get distinct pairs of ngrams
GROUP BY
ngX.id,
ngX.terms,
ngY.id,
ngY.terms
ORDER BY
score DESC
LIMIT
%
d
"""
%
(
node_cooc
.
id
,
corpus
.
id
,
list_id
,
list_id
,
limit
)
# print(query_cooc)
cursor
.
execute
(
query_cooc
)
return
(
node_cooc
.
id
)
def
specificity
(
cooc_id
=
None
,
corpus
=
None
):
cooccurrences
=
session
.
query
(
NodeNgramNgram
)
.
filter
(
NodeNgramNgram
.
node_id
==
cooc_id
)
.
all
()
matrix
=
defaultdict
(
lambda
:
defaultdict
(
float
))
...
...
@@ -149,20 +46,24 @@ def specificity(cooc_id=None, corpus=None):
bulk_insert
(
NodeNodeNgram
,
[
'nodex_id'
,
'nodey_id'
,
'ngram_id'
,
'score'
],
[
d
for
d
in
data
])
return
(
node
.
id
)
def compute_specificity(corpus, limit=100):
    '''
    Computing specificities as NodeNodeNgram.
    All workflow is the following:
        1) Compute the cooc matrix
        2) Compute the specificity score, saving it in database,
           return its Node
    '''
    dbg = DebugTime('Corpus #%d - specificity' % corpus.id)

    # The Cvalue list of the corpus drives the cooccurrence computation.
    list_cvalue = get_or_create_node(nodetype='Cvalue', corpus=corpus)
    cooc_id = cooc(corpus=corpus, miam_id=list_cvalue.id, limit=limit)

    # Derive and persist the specificity scores from that matrix.
    specificity(cooc_id=cooc_id, corpus=corpus)

    dbg.show('specificity')
#corpus=session.query(Node).filter(Node.id==244250).first()
#cooc2(corpus)
#compute_specificity(corpus)
ngram/workflow.py
View file @
b33f37eb
...
...
@@ -13,8 +13,8 @@ def ngram_workflow(corpus):
'''
compute_tfidf
(
corpus
)
compute_tfidf_global
(
corpus
)
compute_cvalue
(
corpus
,
limit
=
1
000
)
# size
compute_specificity
(
corpus
,
limit
=
8
00
)
compute_cvalue
(
corpus
,
limit
=
3
000
)
# size
compute_specificity
(
corpus
,
limit
=
2
00
)
# compute_stop(corpus)
compute_groups
(
corpus
,
limit_inf
=
400
,
limit_sup
=
600
)
# compute_miam(corpus,limit=100) # size
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment