Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
cc0cecce
Commit
cc0cecce
authored
Sep 27, 2016
by
delanoe
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[GRAPH] need factorization.
parent
8c0baf85
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
87 additions
and
63 deletions
+87
-63
constants.py
gargantext/constants.py
+1
-1
bridgeness.py
graph/bridgeness.py
+2
-1
cooccurrences.py
graph/cooccurrences.py
+44
-13
distances.py
graph/distances.py
+3
-3
graph.py
graph/graph.py
+29
-41
rest.py
graph/rest.py
+2
-3
views.py
graph/views.py
+6
-1
No files found.
gargantext/constants.py
View file @
cc0cecce
...
...
@@ -392,7 +392,7 @@ DEFAULT_N_DOCS_HAVING_NGRAM = 5
# Graph constraints to compute the graph:
# Modes: live graph generation, graph asynchronously computed or errors detected
# here are the maximum size of corpus and maplist required to compute the graph
graph_constraints
=
{
'corpusMax'
:
599
graph_constraints
=
{
'corpusMax'
:
100
,
'corpusMin'
:
40
,
'mapList'
:
50
}
graph/bridgeness.py
View file @
cc0cecce
...
...
@@ -9,8 +9,9 @@ from networkx.readwrite import json_graph
def
filterByBridgeness
(
G
,
partition
,
ids
,
weight
,
bridgeness
,
type
,
field1
,
field2
):
'''
What is bridgeness ?
Measure to control links (bridges) between communities.
'''
# Data are stored in a dict(), (== hashmap by default
for
Python)
# Data are stored in a dict(), (== hashmap by default
with
Python)
data
=
dict
()
if
type
==
"node_link"
:
nodesB_dict
=
{}
...
...
graph/cooccurrences.py
View file @
cc0cecce
...
...
@@ -3,21 +3,44 @@ from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram, \
from
gargantext.util.db
import
session
,
aliased
,
bulk_insert
,
func
from
gargantext.util.lists
import
WeightedMatrix
,
UnweightedList
,
Translations
from
graph.distances
import
clusterByDistances
from
graph.bridgeness
import
filterByBridgeness
from
sqlalchemy
import
desc
,
asc
,
or_
,
and_
#import inspect
import
datetime
from
datetime
import
datetime
from
celery
import
shared_task
def filterMatrix(matrix, mapList_id, groupList_id):
    '''
    Restrict a cooccurrence matrix with the map list and the group list.

    matrix       :: the matrix to filter (callers in this file pass a
                    WeightedMatrix)
    mapList_id   :: id used to build the UnweightedList of map terms
    groupList_id :: id used to build the Translations of grouped terms

    Returns the result of ``matrix & (mapList * group_list)``.
    NOTE(review): the exact semantics of ``*`` and ``&`` are defined by the
    classes in gargantext.util.lists -- confirm there.
    '''
    # Build each list once: the map list was previously constructed twice in
    # a row and the first result was thrown away.
    mapList = UnweightedList(mapList_id)
    group_list = Translations(groupList_id)
    cooc = matrix & (mapList * group_list)
    return cooc
def cooc2graph(cooc_id, cooc_matrix, field1="ngrams", field2="ngrams",
               distance=None, bridgeness=None):
    '''
    Build the graph data of a cooccurrence matrix and store it on its Node.

    Steps:
      1. cluster the matrix with clusterByDistances;
      2. filter the result with filterByBridgeness ("node_link" output);
      3. save the data in node.hyperdata[distance]["data"] and commit.

    cooc_id     :: id of the Node whose hyperdata receives the graph
    cooc_matrix :: the (already filtered) cooccurrence matrix
    field1      :: forwarded to filterByBridgeness
    field2      :: forwarded to filterByBridgeness
    distance    :: forwarded to clusterByDistances; also the hyperdata key
    bridgeness  :: forwarded to filterByBridgeness (may be None)

    Returns the data produced by filterByBridgeness.
    Raises ValueError if no Node with id == cooc_id exists.
    '''
    print("GRAPH#%d ... Clustering with distance %s ." % (cooc_id, distance))
    # NOTE(review): the clustering call hard-codes "ngrams" for both fields
    # instead of forwarding field1/field2 -- kept as is, confirm it is wanted.
    G, partition, ids, weight = clusterByDistances(
        cooc_matrix,
        field1="ngrams",
        field2="ngrams",
        distance=distance,
    )

    # %s rather than %d: bridgeness defaults to None, and "%d" % None raises
    # TypeError before any filtering is done.
    print("GRAPH#%d ... Filtering by bridgeness %s." % (cooc_id, bridgeness))
    data = filterByBridgeness(
        G, partition, ids, weight, bridgeness, "node_link", field1, field2
    )

    print("GRAPH#%d ... Saving Graph in hyperdata as json." % cooc_id)
    node = session.query(Node).filter(Node.id == cooc_id).first()
    if node is None:
        # .first() yields None when nothing matches; fail with a clear
        # message instead of an AttributeError on None.hyperdata.
        raise ValueError("GRAPH#%d ... Node not found, cannot save graph." % cooc_id)

    # Any previous entry stored under this distance key is replaced.
    node.hyperdata[distance] = dict()
    node.hyperdata[distance]["data"] = data
    node.save_hyperdata()
    session.commit()

    return data
@
shared_task
def
countCooccurrences
(
corpus_id
=
None
,
test
=
False
,
field1
=
'ngrams'
,
field2
=
'ngrams'
...
...
@@ -26,12 +49,13 @@ def countCooccurrences( corpus_id=None , test= False
,
n_min
=
1
,
n_max
=
None
,
limit
=
1000
,
coocNode_id
=
None
,
reset
=
True
,
isMonopartite
=
True
,
threshold
=
3
,
save_on_db
=
False
,
# just return the WeightedMatrix,
,
distance
=
None
,
bridgeness
=
None
,
save_on_db
=
True
,
# just return the WeightedMatrix,
# (don't write to DB)
):
'''
Compute the cooccurence matrix and save it, returning NodeNgramNgram.node_id
For the moment list of paramters are not supported because, lists need to
For the moment list of param
e
ters are not supported because, lists need to
be merged before.
corpus :: Corpus
...
...
@@ -162,7 +186,7 @@ def countCooccurrences( corpus_id=None , test= False
if
start
is
not
None
:
#date_start = datetime.datetime.strptime ("2001-2-3 10:11:12", "%Y-%m-%d %H:%M:%S")
# TODO : support more complex date formats here.
date_start
=
datetime
.
datetime
.
strptime
(
str
(
start
),
"
%
Y-
%
m-
%
d"
)
date_start
=
datetime
.
strptime
(
str
(
start
),
"
%
Y-
%
m-
%
d"
)
date_start_utc
=
date_start
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
)
Start
=
aliased
(
NodeHyperdata
)
...
...
@@ -178,7 +202,7 @@ def countCooccurrences( corpus_id=None , test= False
if
end
is
not
None
:
# TODO : support more complex date formats here.
date_end
=
datetime
.
datetime
.
strptime
(
str
(
end
),
"
%
Y-
%
m-
%
d"
)
date_end
=
datetime
.
strptime
(
str
(
end
),
"
%
Y-
%
m-
%
d"
)
date_end_utc
=
date_end
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
)
End
=
aliased
(
NodeHyperdata
)
...
...
@@ -208,22 +232,29 @@ def countCooccurrences( corpus_id=None , test= False
#cooc_query = cooc_query.order_by(desc('cooc_score'))
matrix
=
WeightedMatrix
(
cooc_query
)
print
(
"Node #
%
d Filtering the matrix with Map and Group Lists."
%
coocNode_id
)
cooc
=
filterMatrix
(
matrix
,
mapList_id
,
groupList_id
)
parameters
[
'MapList_id'
]
=
str
(
mapList_id
)
parameters
[
'GroupList_id'
]
=
str
(
ma
pList_id
)
parameters
[
'MapList_id'
]
=
str
(
mapList_id
)
parameters
[
'GroupList_id'
]
=
str
(
grou
pList_id
)
if
save_on_db
:
# Saving cooc Matrix
cooc
.
save
(
coocNode_id
)
print
(
"Node Cooccurrence Matrix saved"
)
# Saving the parameters
print
(
"Saving parameters in Node
%
d"
%
coocNode_id
)
coocNode
=
session
.
query
(
Node
)
.
filter
(
Node
.
id
==
coocNode_id
)
.
first
()
coocNode
.
hyperdata
=
parameters
coocNode
.
hyperdata
[
distance
]
=
dict
()
coocNode
.
hyperdata
[
distance
][
"parameters"
]
=
parameters
session
.
add
(
coocNode
)
session
.
commit
()
data
=
cooc2graph
(
coocNode
.
id
,
cooc
,
distance
=
distance
,
bridgeness
=
bridgeness
)
print
(
data
)
# Log message
print
(
"Cooccurrence Matrix saved"
)
return
cooc
else
:
data
=
cooc2graph
(
coocNode_id
,
cooc
,
distance
=
distance
)
return
data
graph/distances.py
View file @
cc0cecce
...
...
@@ -16,16 +16,16 @@ import networkx as nx
def
clusterByDistances
(
cooc_matrix
,
field1
=
None
,
field2
=
None
,
distance
=
'conditional'
):
,
distance
=
None
):
'''
do_d
istance :: Coocs[nga, ngb => ccweight] -> (Graph, Partition, {ids}, {weight})
clusterByD
istance :: Coocs[nga, ngb => ccweight] -> (Graph, Partition, {ids}, {weight})
'''
# implicit global session
authorized
=
[
'conditional'
,
'distributional'
,
'cosine'
]
if
distance
not
in
authorized
:
distance
=
'conditional'
raise
ValueError
(
"Distance must be in
%
s"
%
str
(
authorized
))
matrix
=
defaultdict
(
lambda
:
defaultdict
(
float
))
ids
=
defaultdict
(
lambda
:
defaultdict
(
int
))
...
...
graph/graph.py
View file @
cc0cecce
...
...
@@ -51,9 +51,8 @@ def get_graph( request=None , corpus=None
'''
before_cooc
=
datetime
.
now
()
# case of Cooccurrences have not been computed already
if
cooc_id
==
None
:
...
...
@@ -108,8 +107,7 @@ def get_graph( request=None , corpus=None
.
filter
(
End
.
key
==
'publication_date'
)
.
filter
(
End
.
value_utc
<=
date_end_utc
)
)
# Finally test if the size of the corpora is big enough
# --------------------------------
...
...
@@ -121,6 +119,7 @@ def get_graph( request=None , corpus=None
,
start
=
start
,
end
=
end
,
mapList_id
=
mapList_id
,
groupList_id
=
groupList_id
,
isMonopartite
=
True
,
threshold
=
threshold
,
distance
=
distance
,
bridgeness
=
bridgeness
,
save_on_db
=
True
#, limit=size
)
...
...
@@ -133,57 +132,46 @@ def get_graph( request=None , corpus=None
,
start
=
start
,
end
=
end
,
mapList_id
=
mapList_id
,
groupList_id
=
groupList_id
,
isMonopartite
=
True
,
threshold
=
threshold
,
distance
=
distance
,
bridgeness
=
bridgeness
,
save_on_db
=
True
#, limit=size
)
# Dic to inform user that corpus maximum is reached then
# Dic
t
to inform user that corpus maximum is reached then
# graph is computed asynchronously
return
{
"state"
:
"corpusMax"
,
"length"
:
corpus_size
}
elif
corpus_size
<=
graph_constraints
[
'corpusMin'
]:
# Do not compute the graph if corpus is not big enough
return
{
"state"
:
"corpusMin"
,
"length"
:
corpus_size
}
else
:
# If graph_constraints are ok then compute the graph in live
cooc_matrix
=
countCooccurrences
(
corpus_id
=
corpus
.
id
#, field1="ngrams", field2="ngrams"
,
start
=
start
,
end
=
end
,
mapList_id
=
mapList_id
,
groupList_id
=
groupList_id
,
isMonopartite
=
True
,
threshold
=
threshold
,
save_on_db
=
True
#, limit=size
)
else
:
print
(
"Getting data for matrix
%
d"
,
int
(
cooc_id
))
matrix
=
WeightedMatrix
(
int
(
cooc_id
))
#print(matrix)
cooc_matrix
=
filterMatrix
(
matrix
,
mapList_id
,
groupList_id
)
data
=
countCooccurrences
(
corpus_id
=
corpus
.
id
#, field1="ngrams", field2="ngrams"
,
start
=
start
,
end
=
end
,
mapList_id
=
mapList_id
,
groupList_id
=
groupList_id
,
isMonopartite
=
True
,
threshold
=
threshold
,
distance
=
distance
,
bridgeness
=
bridgeness
,
save_on_db
=
True
#, limit=size
)
# fyi
after_cooc
=
datetime
.
now
()
print
(
"... Cooccurrences took
%
f s."
%
(
after_cooc
-
before_cooc
)
.
total_seconds
())
# case when 0 coocs are observed (usually b/c not enough ngrams in maplist)
# case when 0 coocs are observed (usually b/c not enough ngrams in maplist)
if
len
(
cooc_matrix
.
items
)
==
0
:
print
(
"GET_GRAPH: 0 coocs in matrix"
)
data
=
{
'nodes'
:[],
'links'
:[]}
# empty data
if
len
(
data
)
==
0
:
print
(
"GET_GRAPH: 0 coocs in matrix"
)
data
=
{
'nodes'
:[],
'links'
:[]}
# empty data
# normal case
else
:
G
,
partition
,
ids
,
weight
=
clusterByDistances
(
cooc_matrix
,
field1
=
"ngrams"
,
field2
=
"ngrams"
,
distance
=
distance
)
after_cluster
=
datetime
.
now
()
print
(
"... Clustering took
%
f s."
%
(
after_cluster
-
after_cooc
)
.
total_seconds
())
data
=
filterByBridgeness
(
G
,
partition
,
ids
,
weight
,
bridgeness
,
type
,
field1
,
field2
)
after_filter
=
datetime
.
now
()
print
(
"... Filtering took
%
f s."
%
(
after_filter
-
after_cluster
)
.
total_seconds
())
print
(
"Getting data for matrix
%
d"
,
int
(
cooc_id
))
node
=
session
.
query
(
Node
)
.
filter
(
Node
.
id
==
cooc_id
)
.
first
()
data
=
node
.
hyperdata
[
distance
][
"data"
]
#print(data)
#matrix = WeightedMatrix(int(cooc_id))
#print(matrix)
#cooc_matrix = filterMatrix(matrix, mapList_id, groupList_id)
# normal case
return
data
graph/rest.py
View file @
cc0cecce
...
...
@@ -123,18 +123,17 @@ class Graph(APIView):
groupList_id
=
groupList_id
[
0
]
if
groupList_id
==
None
:
# todo add as an error msg ?
raise
ValueError
(
"GROUPLIST node needed for cooccurrences"
)
#
Check the option
s
#
Declare accepted field
s
accepted_field1
=
[
'ngrams'
,
'journal'
,
'source'
,
'authors'
]
accepted_field2
=
[
'ngrams'
,
]
options
=
[
'start'
,
'end'
,
'threshold'
,
'distance'
,
'cooc_id'
]
try
:
#
Test params
#
Check if parameters are accepted
if
(
field1
in
accepted_field1
)
and
(
field2
in
accepted_field2
):
if
start
is
not
None
and
end
is
not
None
:
data
=
get_graph
(
corpus
=
corpus
,
cooc_id
=
cooc_id
...
...
graph/views.py
View file @
cc0cecce
...
...
@@ -14,6 +14,8 @@ def explorer(request, project_id, corpus_id):
Graph explorer, also known as TinaWebJS, using SigmaJS.
Nodes are ngrams (from title, abstract or journal name).
Links represent a proximity measure.
Data are received in RESTful mode (see rest.py).
'''
# we pass our corpus
...
...
@@ -46,7 +48,10 @@ def explorer(request, project_id, corpus_id):
@
requires_auth
def
myGraphs
(
request
,
project_id
,
corpus_id
):
'''
List all of my Graphs
List all of my Graphs.
Each Graph has one Node of Cooccurrences.
Each Graph is saved in the hyperdata of its Node.
'''
user
=
cache
.
User
[
request
.
user
.
id
]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment