Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
ae011343
Commit
ae011343
authored
Sep 28, 2016
by
delanoe
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[GRAPH] Graph almost done: needs more factorization.
parent
76617d6b
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
89 additions
and
75 deletions
+89
-75
constants.py
gargantext/constants.py
+1
-1
cooccurrences.py
graph/cooccurrences.py
+65
-55
graph.py
graph/graph.py
+11
-16
rest.py
graph/rest.py
+12
-3
No files found.
gargantext/constants.py
View file @
ae011343
...
...
@@ -392,7 +392,7 @@ DEFAULT_N_DOCS_HAVING_NGRAM = 5
# Graph constraints to compute the graph:
# Modes: live graph generation, graph asynchronously computed or errors detected
# Here are the maximum corpus size, minimum corpus size and map list size required to compute the graph
graph_constraints
=
{
'corpusMax'
:
5
00
graph_constraints
=
{
'corpusMax'
:
1
00
,
'corpusMin'
:
40
,
'mapList'
:
50
}
graph/cooccurrences.py
View file @
ae011343
from
gargantext.models
import
Node
,
Ngram
,
NodeNgram
,
NodeNgramNgram
,
\
NodeHyperdata
,
HyperdataKey
from
gargantext.util.db
import
session
,
aliased
,
bulk_insert
,
func
from
gargantext.util.db
import
session
,
aliased
,
func
from
gargantext.util.lists
import
WeightedMatrix
,
UnweightedList
,
Translations
from
graph.distances
import
clusterByDistances
...
...
@@ -19,12 +19,28 @@ def filterMatrix(matrix, mapList_id, groupList_id):
cooc
=
matrix
&
(
mapList
*
group_list
)
return
cooc
# computeGraph
def
cooc2graph
(
cooc_id
,
cooc_matrix
,
field1
=
"ngrams"
,
field2
=
"ngrams"
,
distance
=
None
,
bridgeness
=
None
):
@
shared_task
def
computeGraph
(
corpus_id
=
None
,
cooc_id
=
None
,
field1
=
'ngrams'
,
field2
=
'ngrams'
,
start
=
None
,
end
=
None
,
mapList_id
=
None
,
groupList_id
=
None
,
distance
=
None
,
bridgeness
=
None
,
n_min
=
1
,
n_max
=
None
,
limit
=
1000
,
isMonopartite
=
True
,
threshold
=
3
,
save_on_db
=
True
,
reset
=
True
):
print
(
"GRAPH# ... Computing cooccurrences."
)
(
cooc_id
,
cooc_matrix
)
=
countCooccurrences
(
corpus_id
=
corpus_id
,
cooc_id
=
cooc_id
,
field1
=
field1
,
field2
=
field2
,
start
=
start
,
end
=
end
,
mapList_id
=
mapList_id
,
groupList_id
=
groupList_id
,
isMonopartite
=
True
,
threshold
=
threshold
,
distance
=
distance
,
bridgeness
=
bridgeness
,
save_on_db
=
True
)
print
(
"GRAPH#
%
d ... Cooccurrences computed."
%
(
cooc_id
))
print
(
"GRAPH#
%
d ... Computing cooccurrences."
%
(
cooc_id
))
# Check if already computed cooc
# (cooc_id, cooc) = count(countCooccurrences)
print
(
"GRAPH#
%
d ... Clustering with distance
%
s ."
%
(
cooc_id
,
distance
))
G
,
partition
,
ids
,
weight
=
clusterByDistances
(
cooc_matrix
...
...
@@ -50,17 +66,14 @@ def cooc2graph( cooc_id, cooc_matrix, field1="ngrams", field2="ngrams", distance
return
data
@
shared_task
def
countCooccurrences
(
corpus_id
=
None
,
test
=
False
def
countCooccurrences
(
corpus_id
=
None
,
cooc_id
=
None
,
field1
=
'ngrams'
,
field2
=
'ngrams'
,
start
=
None
,
end
=
None
,
mapList_id
=
None
,
groupList_id
=
None
,
distance
=
None
,
bridgeness
=
None
,
n_min
=
1
,
n_max
=
None
,
limit
=
1000
,
coocNode_id
=
None
,
reset
=
True
,
isMonopartite
=
True
,
threshold
=
3
,
distance
=
None
,
bridgeness
=
None
,
save_on_db
=
True
,
# just return the WeightedMatrix,
# (don't write to DB)
,
save_on_db
=
True
,
reset
=
True
):
'''
Compute the cooccurrence matrix and save it, returning NodeNgramNgram.node_id
...
...
@@ -71,15 +84,13 @@ def countCooccurrences( corpus_id=None , test= False
mapList_id :: Int
groupList_id :: Int
For the moment, start and end are handled simply: only the year is implemented so far
start :: TimeStamp -- example: '2010-05-30 02:00:00+02'
end :: TimeStamp
limit :: Int
'''
# TODO : add hyperdata here
#
Parameters to save in hyperdata of the Node Cooc
#
FIXME remove the lines below after factorization of parameters
parameters
=
dict
()
parameters
[
'field1'
]
=
field1
parameters
[
'field2'
]
=
field2
...
...
@@ -88,16 +99,16 @@ def countCooccurrences( corpus_id=None , test= False
corpus
=
session
.
query
(
Node
)
.
filter
(
Node
.
id
==
corpus_id
)
.
first
()
# Get node of the Graph
if
not
cooc
Node
_id
:
if
not
cooc_id
:
cooc
Node
_id
=
(
session
.
query
(
Node
.
id
)
cooc_id
=
(
session
.
query
(
Node
.
id
)
.
filter
(
Node
.
typename
==
"COOCCURRENCES"
,
Node
.
name
==
"GRAPH EXPLORER"
,
Node
.
parent_id
==
corpus
.
id
)
.
first
()
)
if
not
cooc
Node
_id
:
if
not
cooc_id
:
coocNode
=
corpus
.
add_child
(
typename
=
"COOCCURRENCES"
,
name
=
"GRAPH (in corpus
%
s)"
%
corpus
.
id
...
...
@@ -105,12 +116,12 @@ def countCooccurrences( corpus_id=None , test= False
session
.
add
(
coocNode
)
session
.
commit
()
cooc
Node
_id
=
coocNode
.
id
cooc_id
=
coocNode
.
id
else
:
cooc
Node_id
=
int
(
coocNode
_id
[
0
])
cooc
_id
=
int
(
cooc
_id
[
0
])
if
reset
==
True
:
session
.
query
(
NodeNgramNgram
)
.
filter
(
NodeNgramNgram
.
node_id
==
cooc
Node
_id
)
.
delete
()
session
.
query
(
NodeNgramNgram
)
.
filter
(
NodeNgramNgram
.
node_id
==
cooc_id
)
.
delete
()
session
.
commit
()
...
...
@@ -191,7 +202,7 @@ def countCooccurrences( corpus_id=None , test= False
# Cooc between the dates start and end
if
start
is
not
None
:
#date_start = datetime.datetime.strptime ("2001-2-3 10:11:12", "%Y-%m-%d %H:%M:%S")
# TODO : more
complexe date format here
.
# TODO : more
precise date format here (the day is currently the smallest grain)
.
date_start
=
datetime
.
strptime
(
str
(
start
),
"
%
Y-
%
m-
%
d"
)
date_start_utc
=
date_start
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
)
...
...
@@ -207,7 +218,7 @@ def countCooccurrences( corpus_id=None , test= False
if
end
is
not
None
:
# TODO : more
complexe date format here
.
# TODO : more
precise date format here (the day is currently the smallest grain)
.
date_end
=
datetime
.
strptime
(
str
(
end
),
"
%
Y-
%
m-
%
d"
)
date_end_utc
=
date_end
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
)
...
...
@@ -239,28 +250,27 @@ def countCooccurrences( corpus_id=None , test= False
matrix
=
WeightedMatrix
(
cooc_query
)
print
(
"GRAPH #
%
s Filtering the matrix with Map and Group Lists."
%
cooc
Node
_id
)
print
(
"GRAPH #
%
s Filtering the matrix with Map and Group Lists."
%
cooc_id
)
cooc
=
filterMatrix
(
matrix
,
mapList_id
,
groupList_id
)
parameters
[
'MapList_id'
]
=
str
(
mapList_id
)
parameters
[
'GroupList_id'
]
=
str
(
groupList_id
)
# TODO factorize savings on db
if
save_on_db
:
# Saving the cooccurrences
cooc
.
save
(
cooc
Node
_id
)
print
(
"GRAPH#
%
s ... Node Cooccurrence Matrix saved"
%
cooc
Node
_id
)
cooc
.
save
(
cooc_id
)
print
(
"GRAPH#
%
s ... Node Cooccurrence Matrix saved"
%
cooc_id
)
# Saving the parameters
print
(
"GRAPH#
%
s ... Parameters saved in Node."
%
cooc
Node
_id
)
coocNode
=
session
.
query
(
Node
)
.
filter
(
Node
.
id
==
cooc
Node
_id
)
.
first
()
print
(
"GRAPH#
%
s ... Parameters saved in Node."
%
cooc_id
)
coocNode
=
session
.
query
(
Node
)
.
filter
(
Node
.
id
==
cooc_id
)
.
first
()
coocNode
.
hyperdata
[
distance
]
=
dict
()
coocNode
.
hyperdata
[
distance
][
"parameters"
]
=
parameters
session
.
add
(
coocNode
)
session
.
commit
()
data
=
cooc2graph
(
coocNode
.
id
,
cooc
,
distance
=
distance
,
bridgeness
=
bridgeness
)
return
data
#
data = cooc2graph(coocNode.id, cooc, distance=distance, bridgeness=bridgeness)
#
return data
else
:
data
=
cooc2graph
(
coocNode_id
,
cooc
,
distance
=
distance
)
return
data
return
(
coocNode
.
id
,
cooc
)
graph/graph.py
View file @
ae011343
...
...
@@ -5,7 +5,7 @@ from gargantext.util.http import JsonHttpResponse
from
gargantext.models
import
Node
,
Ngram
,
NodeNgram
,
NodeNgramNgram
,
NodeHyperdata
#from gargantext.util.toolchain.ngram_coocs import compute_coocs
from
graph.cooccurrences
import
co
untCooccurrences
,
filterMatrix
from
graph.cooccurrences
import
co
mputeGraph
,
filterMatrix
from
graph.distances
import
clusterByDistances
from
graph.bridgeness
import
filterByBridgeness
...
...
@@ -19,12 +19,9 @@ def get_graph( request=None , corpus=None
,
mapList_id
=
None
,
groupList_id
=
None
,
cooc_id
=
None
,
type
=
'node_link'
,
start
=
None
,
end
=
None
,
threshold
=
1
,
distance
=
'conditional'
,
isMonopartite
=
True
# By default, we compute terms/terms graph
,
bridgeness
=
5
,
saveOnly
=
None
#, size=1000
,
distance
=
'conditional'
,
bridgeness
=
5
,
threshold
=
1
,
isMonopartite
=
True
,
saveOnly
=
True
):
'''
Get_graph : main steps:
...
...
@@ -54,7 +51,7 @@ def get_graph( request=None , corpus=None
# Case of graph has been computed already
if
cooc_id
is
not
None
:
print
(
"G
etting data for matrix
%
d"
,
int
(
cooc_id
))
print
(
"G
RAPH#
%
d ... Loading data already computed."
%
int
(
cooc_id
))
node
=
session
.
query
(
Node
)
.
filter
(
Node
.
id
==
cooc_id
)
.
first
()
# Structure of the Node.hyperdata[distance][bridgeness]
...
...
@@ -65,8 +62,6 @@ def get_graph( request=None , corpus=None
if
node
.
hyperdata
.
get
(
distance
,
None
)
is
not
None
:
graph
=
node
.
hyperdata
[
distance
]
print
(
node
.
hyperdata
[
distance
]
.
keys
())
# Check bridgeness of the graph
if
graph
.
get
(
str
(
bridgeness
),
None
)
is
not
None
:
return
graph
[
str
(
bridgeness
)]
...
...
@@ -133,7 +128,7 @@ def get_graph( request=None , corpus=None
corpus_size
=
corpus_size_query
.
count
()
if
saveOnly
is
not
None
and
saveOnly
==
"True"
:
scheduled
(
co
untCooccurrences
)(
corpus_id
=
corpus
.
id
,
coocNode
_id
=
cooc_id
scheduled
(
co
mputeGraph
)(
corpus_id
=
corpus
.
id
,
cooc
_id
=
cooc_id
#, field1="ngrams", field2="ngrams"
,
start
=
start
,
end
=
end
,
mapList_id
=
mapList_id
,
groupList_id
=
groupList_id
...
...
@@ -144,9 +139,9 @@ def get_graph( request=None , corpus=None
)
return
{
"state"
:
"saveOnly"
}
if
corpus_size
>
graph_constraints
[
'corpusMax'
]:
el
if
corpus_size
>
graph_constraints
[
'corpusMax'
]:
# Then compute cooc asynchronously with celery
scheduled
(
co
untCooccurrences
)(
corpus_id
=
corpus
.
id
,
coocNode
_id
=
cooc_id
scheduled
(
co
mputeGraph
)(
corpus_id
=
corpus
.
id
,
cooc
_id
=
cooc_id
#, field1="ngrams", field2="ngrams"
,
start
=
start
,
end
=
end
,
mapList_id
=
mapList_id
,
groupList_id
=
groupList_id
...
...
@@ -155,8 +150,8 @@ def get_graph( request=None , corpus=None
,
save_on_db
=
True
#, limit=size
)
# Dict to inform user that corpus maximum is reached
then
# graph is computed asynchronously
# Dict to inform user that corpus maximum is reached
#
then
graph is computed asynchronously
return
{
"state"
:
"corpusMax"
,
"length"
:
corpus_size
}
elif
corpus_size
<=
graph_constraints
[
'corpusMin'
]:
...
...
@@ -165,7 +160,7 @@ def get_graph( request=None , corpus=None
else
:
# If graph_constraints are ok then compute the graph live
data
=
co
untCooccurrences
(
corpus_id
=
corpus
.
id
,
coocNode
_id
=
cooc_id
data
=
co
mputeGraph
(
corpus_id
=
corpus
.
id
,
cooc
_id
=
cooc_id
#, field1="ngrams", field2="ngrams"
,
start
=
start
,
end
=
end
,
mapList_id
=
mapList_id
,
groupList_id
=
groupList_id
...
...
graph/rest.py
View file @
ae011343
#from rest_framework.authentication import SessionAuthentication, BasicAuthentication
from
gargantext.util.db
import
session
from
gargantext.models.nodes
import
Node
from
graph.graph
import
get_graph
...
...
@@ -29,6 +27,16 @@ class Graph(APIView):
# Get the node we are working with
corpus
=
session
.
query
(
Node
)
.
filter
(
Node
.
id
==
corpus_id
)
.
first
()
# TODO Parameters to save in hyperdata of the Node Cooc
# WARNING: we could factorize the parameters as dict but ...
# ... it causes a bug in asynchronous function !
# Check celery upgrades before.
# Example (for the future):
# parameters = dict()
# parameters['field1'] = field1
# parameters['field2'] = field2
# Get all the parameters in the URL
cooc_id
=
request
.
GET
.
get
(
'cooc_id'
,
None
)
saveOnly
=
request
.
GET
.
get
(
'saveOnly'
,
None
)
...
...
@@ -48,6 +56,7 @@ class Graph(APIView):
type_
=
str
(
request
.
GET
.
get
(
'type'
,
'node_link'
))
distance
=
str
(
request
.
GET
.
get
(
'distance'
,
'conditional'
))
# Get default map List of corpus
if
mapList_id
==
0
:
mapList_id
=
(
session
.
query
(
Node
.
id
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment