humanities / gargantext / Commits

Commit ce761204
authored Oct 31, 2017 by sim
Remove graph module

parent 6a0506a4
Showing 18 changed files with 1 addition and 2667 deletions (+1, -2667)
Changed files:

  gargantext/graph/README.md                   +0   -91
  gargantext/graph/__init__.py                 +0    -0
  gargantext/graph/bridgeness.py               +0  -123
  gargantext/graph/cooccurrences.py            +0  -236
  gargantext/graph/distances.py                +0  -220
  gargantext/graph/graph.py                    +0  -274
  gargantext/graph/growth.py                   +0   -61
  gargantext/graph/intersection.py             +0  -103
  gargantext/graph/louvain.py                  +0  -554
  gargantext/graph/mail_notification.py        +0   -34
  gargantext/graph/rest.py                     +0  -227
  gargantext/graph/templates/explorer.html     +0  -546
  gargantext/graph/urls.py                     +0   -20
  gargantext/graph/utils.py                    +0   -49
  gargantext/graph/views.py                    +0  -111
  gargantext/settings.py                       +0    -2
  gargantext/urls.py                           +0    -7
  gargantext/views/api/urls.py                 +1    -9
gargantext/graph/README.md (deleted, 100644 → 0)
Module Graph Explorer: from text to graph
=========================================

## Graph Explorer main

0) All urls.py of the Graph Explorer
1) Main view of the graph explorer: views.py
   -> Graph Explorer
   -> My graph View
   -> REST API to get Data
2) Graph is generated (graph.py) through different steps (a minimal sketch of the whole chain follows this list)
   a) check the constraints (graph_constraints) in gargantext/constants.py
   b) Data are retrieved as REST
      rest.py: check REST parameters
   c) graph.py:
      get_graph: check Graph parameters
      compute_graph: compute graph
      1) Cooccurrences are computed (live or asynchronously): cooccurrences.py
      2) Threshold and distances: distances.py
      3) Clustering: louvain.py
      4) Links between communities: bridgeness.py
   d) compress graph before returning it: utils.py
4) Additional features:
   a) intersection of graphs: intersection.py
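A minimal sketch of that chain, assuming the functions defined in the modules above (countCooccurrences, clusterByDistances, filterByBridgeness, compress_graph) and the module paths as they stood before this commit. The real orchestration lives in graph.py (collapsed in this diff), so this is illustrative only:

```python
# Illustrative sketch, not the original graph.py.
from gargantext.graph.cooccurrences import countCooccurrences
from gargantext.graph.distances     import clusterByDistances
from gargantext.graph.bridgeness    import filterByBridgeness
from gargantext.graph.utils         import compress_graph

def compute_graph_sketch(corpus_id, mapList_id, groupList_id,
                         distance='conditional', bridgeness=5):
    # 1) cooccurrences.py: count co-occurrences and store them on a COOCCURRENCES node
    cooc_id, cooc_matrix = countCooccurrences(corpus_id=corpus_id,
                                              mapList_id=mapList_id,
                                              groupList_id=groupList_id)
    # 2) + 3) distances.py: distance measure, thresholding, then Louvain clustering
    G, partition, ids, weight = clusterByDistances(cooc_matrix,
                                                   field1='ngrams', field2='ngrams',
                                                   distance=distance)
    # 4) bridgeness.py: keep only the strongest links between communities
    data = filterByBridgeness(G, partition, ids, weight,
                              bridgeness, 'node_link', 'ngrams', 'ngrams')
    # utils.py: shrink the JSON payload before returning it to the explorer
    return compress_graph(data)
```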
## How to contribute?

Some solutions:
1) please report to dev@gargantext.org
2) fix with git repo and pull request

## TODO

myGraphs view:
* progress bar
* show already computed graphs vs. graphs still to be computed, with their parameters
* show parameters
* copy / paste and change some parameters to generate a new graph
gargantext/graph/__init__.py (deleted, 100644 → 0, empty file)
gargantext/graph/bridgeness.py (deleted, 100644 → 0)

# Article coming soon

from gargantext.util.db       import session
from gargantext.models.ngrams import Ngram

from collections        import defaultdict
from networkx.readwrite import json_graph


def filterByBridgeness(G, partition, ids, weight, bridgeness, type, field1, field2):
    '''
    Bridgeness = measure to control links (bridges) between communities.
    '''
    # Data are stored in a dict(), (== hashmap by default with Python)
    data = dict()

    if type == "node_link":
        nodesB_dict = {}

        for node_id in G.nodes():
            #node,type(labels[node])
            nodesB_dict[ids[node_id][1]] = True

            # TODO the query below is not optimized (do it do_distance).
            the_label = session.query(Ngram.terms).filter(Ngram.id == node_id).first()
            the_label = ", ".join(the_label)

            G.node[node_id]['label']      = the_label
            G.node[node_id]['size']       = weight[node_id]
            G.node[node_id]['type']       = ids[node_id][0].replace("ngrams", "terms")
            G.node[node_id]['attributes'] = {"clust_default": partition[node_id]}  # new format
            # G.add_edge(node, "cluster " + str(partition[node]), weight=3)

        links = []
        i = 1

        if bridgeness > 0:
            com_link = defaultdict(lambda: defaultdict(list))
            com_ids  = defaultdict(list)

            for k, v in partition.items():
                com_ids[v].append(k)

        for e in G.edges_iter():
            s = e[0]
            t = e[1]
            weight = G[ids[s][1]][ids[t][1]]["weight"]

            if bridgeness < 0:
                info = {"s": ids[s][1], "t": ids[t][1], "w": weight}
                links.append(info)

            else:
                if partition[s] == partition[t]:
                    info = {"s": ids[s][1], "t": ids[t][1], "w": weight}
                    links.append(info)

                if bridgeness > 0:
                    if partition[s] < partition[t]:
                        com_link[partition[s]][partition[t]].append((s, t, weight))

        if bridgeness > 0:
            for c1 in com_link.keys():
                for c2 in com_link[c1].keys():

                    index = round(  bridgeness * len(com_link[c1][c2])
                                  / #----------------------------------#
                                    (len(com_ids[c1]) + len(com_ids[c2]))
                                  )

                    #print((c1,len(com_ids[c1])), (c2,len(com_ids[c2])), index)

                    if index > 0:
                        for link in sorted(com_link[c1][c2],
                                           key=lambda x: x[2],
                                           reverse=True)[:index]:
                            #print(c1, c2, link[2])
                            info = {"s": link[0], "t": link[1], "w": link[2]}
                            links.append(info)

        B = json_graph.node_link_data(G)
        B["links"] = []
        B["links"] = links

        if field1 == field2 == 'ngrams':
            data["nodes"] = B["nodes"]
            data["links"] = B["links"]

        else:
            A = get_graphA("journal", nodesB_dict, B["links"], corpus)

            print("#nodesA:",              len(A["nodes"]))
            print("#linksAA + #linksAB:",  len(A["links"]))
            print("#nodesB:",              len(B["nodes"]))
            print("#linksBB:",             len(B["links"]))

            data["nodes"] = A["nodes"] + B["nodes"]
            data["links"] = A["links"] + B["links"]

            print("  total nodes :", len(data["nodes"]))
            print("  total links :", len(data["links"]))
            print("")

    elif type == "adjacency":
        for node in G.nodes():
            try:
                #node,type(labels[node])
                #G.node[node]['label'] = node
                G.node[node]['name']  = node
                #G.node[node]['size'] = weight[node]
                G.node[node]['group'] = partition[node]
                #G.add_edge(node, partition[node], weight=3)
            except Exception as error:
                print("error02: ", error)

        data = json_graph.node_link_data(G)

    elif type == 'bestpartition':
        return(partition)

    return(data)
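To make the bridgeness cut-off above concrete, here is a small worked example with invented community sizes; the formula is the one used inside filterByBridgeness:

```python
# Worked example with made-up numbers:
# community c1 has 40 nodes, c2 has 20 nodes, and they share 12 candidate links.
bridgeness  = 5
inter_links = 12
size_c1, size_c2 = 40, 20

index = round(bridgeness * inter_links / (size_c1 + size_c2))
# index == 1, so only the heaviest of the 12 inter-community links is kept.
```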
gargantext/graph/cooccurrences.py (deleted, 100644 → 0)

from gargantext.models    import Node, Ngram, NodeNgram, NodeNgramNgram, \
                                 NodeHyperdata, HyperdataKey

from gargantext.util.db    import session, aliased, func
from gargantext.util.lists import WeightedMatrix, UnweightedList, Translations

from sqlalchemy import desc, asc, or_, and_

from datetime import datetime


def filterMatrix(matrix, mapList_id, groupList_id):
    mapList    = UnweightedList(mapList_id)
    group_list = Translations(groupList_id)
    cooc       = matrix & (mapList * group_list)
    return cooc


def countCooccurrences(corpus_id=None, cooc_id=None,
                       field1='ngrams', field2='ngrams',
                       start=None, end=None,
                       mapList_id=None, groupList_id=None,
                       distance=None, bridgeness=None,
                       n_min=1, n_max=None, limit=1000,
                       isMonopartite=True, threshold=3,
                       save_on_db=True, reset=True):
    '''
    Compute the cooccurrence matrix and save it, returning NodeNgramNgram.node_id.
    For the moment, lists of parameters are not supported because lists need to
    be merged first.

    corpus       :: Corpus
    mapList_id   :: Int
    groupList_id :: Int
    start        :: TimeStamp -- example: '2010-05-30 02:00:00+02'
    end          :: TimeStamp
    limit        :: Int
    '''
    # FIXME remove the lines below after factorization of parameters
    parameters = dict()
    parameters['field1'] = field1
    parameters['field2'] = field2

    # Get corpus as Python object
    corpus = session.query(Node).filter(Node.id == corpus_id).first()

    # Get node of the Graph
    if not cooc_id:
        cooc_id = (session.query(Node.id)
                          .filter(Node.typename  == "COOCCURRENCES",
                                  Node.name      == "GRAPH EXPLORER",
                                  Node.parent_id == corpus.id)
                          .first()
                   )
        if not cooc_id:
            coocNode = corpus.add_child(
                typename = "COOCCURRENCES",
                name     = "GRAPH (in corpus %s)" % corpus.id
            )
            session.add(coocNode)
            session.commit()
            cooc_id = coocNode.id
        else:
            cooc_id = int(cooc_id[0])

    # when cooc_id preexisted, but we want to continue (reset = True)
    # (to give new contents to this cooc_id)
    elif reset:
        print("GRAPH #%s ... Counting new cooccurrences data." % cooc_id)
        session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id == cooc_id).delete()
        session.commit()

    # when cooc_id preexisted and we just want to load it (reset = False)
    else:
        print("GRAPH #%s ... Loading cooccurrences computed already." % cooc_id)
        cooc = (session.query(NodeNgramNgram.ngram1_id,
                              NodeNgramNgram.ngram2_id,
                              NodeNgramNgram.weight)
                       .filter(NodeNgramNgram.node_id == cooc_id)
                       .all()
                )
        return(int(cooc_id), WeightedMatrix(cooc))

    NodeNgramX = aliased(NodeNgram)

    # Simple Cooccurrences
    cooc_score = func.count(NodeNgramX.node_id).label('cooc_score')

    # A kind of Euclidean distance cooccurrences
    #cooc_score = func.sqrt(func.sum(NodeNgramX.weight * NodeNgramY.weight)).label('cooc_score')

    if isMonopartite:
        NodeNgramY = aliased(NodeNgram)

        cooc_query = (session.query(NodeNgramX.ngram_id, NodeNgramY.ngram_id, cooc_score)
                             .join(Node, Node.id == NodeNgramX.node_id)
                             .join(NodeNgramY, NodeNgramY.node_id == Node.id)
                             .filter(Node.parent_id == corpus.id, Node.typename == "DOCUMENT")
                      )
    else:
        NodeNgramY = aliased(NodeNgram)

        cooc_query = (session.query(NodeHyperdataNgram.ngram_id, NodeNgramY.ngram_id, cooc_score)
                             .join(Node, Node.id == NodeHyperdataNgram.node_id)
                             .join(NodeNgramY, NodeNgramY.node_id == Node.id)
                             .join(Hyperdata, Hyperdata.id == NodeHyperdataNgram.hyperdata_id)
                             .filter(Node.parent_id == corpus.id, Node.typename == "DOCUMENT")
                             .filter(Hyperdata.name == field1)
                      )

    # Size of the ngrams between n_min and n_max
    if n_min is not None or n_max is not None:
        if isMonopartite:
            NgramX = aliased(Ngram)
            cooc_query = cooc_query.join(NgramX, NgramX.id == NodeNgramX.ngram_id)

        NgramY = aliased(Ngram)
        cooc_query = cooc_query.join(NgramY, NgramY.id == NodeNgramY.ngram_id)

        if n_min is not None:
            cooc_query = (cooc_query.filter(NgramY.n >= n_min))
            if isMonopartite:
                cooc_query = cooc_query.filter(NgramX.n >= n_min)

        if n_max is not None:
            cooc_query = (cooc_query.filter(NgramY.n >= n_min))
            if isMonopartite:
                cooc_query = cooc_query.filter(NgramX.n >= n_min)

    # Cooc between the dates start and end
    if start is not None:
        #date_start = datetime.datetime.strptime ("2001-2-3 10:11:12", "%Y-%m-%d %H:%M:%S")
        # TODO : more precise date format here (day is smaller grain actually).
        date_start     = datetime.strptime(str(start), "%Y-%m-%d")
        date_start_utc = date_start.strftime("%Y-%m-%d %H:%M:%S")

        Start = aliased(NodeHyperdata)
        cooc_query = (cooc_query.join(Start, Start.node_id == Node.id)
                                .filter(Start.key == 'publication_date')
                                .filter(Start.value_utc >= date_start_utc)
                      )
        parameters['start'] = date_start_utc

    if end is not None:
        # TODO : more precise date format here (day is smaller grain actually).
        date_end     = datetime.strptime(str(end), "%Y-%m-%d")
        date_end_utc = date_end.strftime("%Y-%m-%d %H:%M:%S")

        End = aliased(NodeHyperdata)
        cooc_query = (cooc_query.join(End, End.node_id == Node.id)
                                .filter(End.key == 'publication_date')
                                .filter(End.value_utc <= date_end_utc)
                      )
        parameters['end'] = date_end_utc

    if isMonopartite:
        # Cooc is symmetric, take only the main cooccurrences and cut at the limit
        cooc_query = cooc_query.filter(NodeNgramX.ngram_id < NodeNgramY.ngram_id)

    cooc_query = cooc_query.having(cooc_score >= threshold)

    if isMonopartite:
        cooc_query = cooc_query.group_by(NodeNgramX.ngram_id, NodeNgramY.ngram_id)
    else:
        cooc_query = cooc_query.group_by(NodeHyperdataNgram.ngram_id, NodeNgramY.ngram_id)

    # Order according some scores
    # If ordering is really needed, use Ordered Index (faster)
    #cooc_query = cooc_query.order_by(desc('cooc_score'))

    matrix = WeightedMatrix(cooc_query)

    print("GRAPH #%s Filtering the matrix with Map and Group Lists." % cooc_id)
    cooc = filterMatrix(matrix, mapList_id, groupList_id)

    parameters['MapList_id']   = str(mapList_id)
    parameters['GroupList_id'] = str(groupList_id)

    # TODO factorize savings on db
    if save_on_db:
        # Saving the cooccurrences
        cooc.save(cooc_id)
        print("GRAPH #%s ... Node Cooccurrence Matrix saved" % cooc_id)

        # Saving the parameters
        print("GRAPH #%s ... Parameters saved in Node." % cooc_id)
        coocNode = session.query(Node).filter(Node.id == cooc_id).first()

        coocNode.hyperdata["parameters"] = dict()
        coocNode.hyperdata["parameters"] = parameters
        coocNode.save_hyperdata()
        session.commit()

        #data = cooc2graph(coocNode.id, cooc, distance=distance, bridgeness=bridgeness)
    else:
        return cooc

    return(coocNode.id, cooc)
gargantext/graph/distances.py (deleted, 100644 → 0)

import math
import numpy    as np
import pandas   as pd
import networkx as nx

from copy        import copy
from collections import defaultdict
from math        import log, sqrt
#from operator   import itemgetter

from gargantext.models  import Node, NodeNgram, NodeNgramNgram, \
                               NodeHyperdata
from gargantext.util.db import session, aliased

from .louvain import best_partition


def clusterByDistances(cooc_matrix, field1=None, field2=None, distance=None):
    '''
    clusterByDistance :: Coocs[nga, ngb => ccweight] -> (Graph, Partition, {ids}, {weight})
    '''
    # implicit global session
    authorized = ['conditional', 'distributional', 'cosine']
    if distance not in authorized:
        raise ValueError("Distance must be in %s" % str(authorized))

    matrix = defaultdict(lambda: defaultdict(float))
    ids    = defaultdict(lambda: defaultdict(int))
    labels = dict()
    weight = dict()

    for cooc in cooc_matrix.items:
        ngram1_id = cooc[0]
        ngram2_id = cooc[1]
        ccweight  = cooc_matrix.items[cooc]

        matrix[ngram1_id][ngram2_id] = ccweight
        matrix[ngram2_id][ngram1_id] = ccweight

        ids[ngram1_id] = (field1, ngram1_id)
        ids[ngram2_id] = (field2, ngram2_id)

        weight[ngram1_id] = weight.get(ngram1_id, 0) + ccweight
        weight[ngram2_id] = weight.get(ngram2_id, 0) + ccweight

    x = pd.DataFrame(matrix).fillna(0)

    if distance == 'conditional':
        x = x / x.sum(axis=1)
        #y = y / y.sum(axis=0)

        xs = x.sum(axis=1) - x
        ys = x.sum(axis=0) - x

        # top included or excluded
        n = (xs + ys) / (2 * (x.shape[0] - 1))
        # top generic or specific
        m = (xs - ys) / (2 * (x.shape[0] - 1))

        n = n.sort_index(inplace=False)
        m = m.sort_index(inplace=False)

        nodes_included = 10000  #int(round(size/20,0))
        #nodes_excluded = int(round(size/10,0))

        nodes_specific = 10000  #int(round(size/10,0))
        #nodes_generic = int(round(size/10,0))

        # TODO use the included score for the node size
        n_index = pd.Index.intersection(x.index, n.index[:nodes_included])
        # Generic:
        #m_index = pd.Index.intersection(x.index, m.index[:nodes_generic])
        # Specific:
        m_index = pd.Index.intersection(x.index, m.index[-nodes_specific:])
        #m_index = pd.Index.intersection(x.index, n.index[:nodes_included])

        x_index = pd.Index.union(n_index, m_index)
        xx = x[list(x_index)].T[list(x_index)]

        # Removing unconnected nodes
        xxx = xx.values
        threshold = min(xxx.max(axis=1))
        matrix_filtered = np.where(xxx >= threshold, xxx, 0)
        #matrix_filtered = matrix_filtered.resize((90,90))

        G = nx.from_numpy_matrix(np.matrix(matrix_filtered))
        G = nx.relabel_nodes(G, dict(enumerate([ids[id_][1] for id_ in list(xx.columns)])))

    elif distance == 'cosine':
        scd = defaultdict(lambda: defaultdict(int))

        for i in matrix.keys():
            for j in matrix.keys():
                numerator = sum(
                    [matrix[i][k] * matrix[j][k]
                        for k in matrix.keys()
                        if i != j and k != i and k != j
                    ]
                )

                denominator = sqrt(
                    sum([matrix[i][k]
                            for k in matrix.keys()
                            if k != i and k != j
                            #and matrix[i][k] > 0
                        ])
                    *
                    sum([matrix[i][k]
                            for k in matrix.keys()
                            if k != i and k != j
                            #and matrix[i][k] > 0
                        ])
                )

                try:
                    scd[i][j] = numerator / denominator
                except Exception as error:
                    scd[i][j] = 0

        minmax = min([max([scd[i][j] for i in scd.keys()]) for j in scd.keys()])

        G = nx.DiGraph()
        G.add_edges_from(
            [(i, j, {'weight': scd[i][j]})
                for i in scd.keys()
                for j in scd.keys()
                if i != j and scd[i][j] > minmax and scd[i][j] > scd[j][i]
            ]
        )

    elif distance == 'distributional':
        mi = defaultdict(lambda: defaultdict(int))
        total_cooc = x.sum().sum()

        for i in matrix.keys():
            si = sum([matrix[i][j] for j in matrix[i].keys() if i != j])
            for j in matrix[i].keys():
                sj = sum([matrix[j][k] for k in matrix[j].keys() if j != k])
                if i != j:
                    mi[i][j] = log(matrix[i][j] / ((si * sj) / total_cooc))

        r = defaultdict(lambda: defaultdict(int))

        for i in matrix.keys():
            for j in matrix.keys():
                sumMin = sum(
                    [min(mi[i][k], mi[j][k])
                        for k in matrix.keys()
                        if i != j and k != i and k != j and mi[i][k] > 0
                    ]
                )

                sumMi = sum(
                    [mi[i][k]
                        for k in matrix.keys()
                        if k != i and k != j and mi[i][k] > 0
                    ]
                )

                try:
                    r[i][j] = sumMin / sumMi
                except Exception as error:
                    r[i][j] = 0

        # Need to filter the weak links, automatic threshold here
        minmax = min([max([r[i][j] for i in r.keys()]) for j in r.keys()])

        G = nx.DiGraph()
        G.add_edges_from(
            [(i, j, {'weight': r[i][j]})
                for i in r.keys()
                for j in r.keys()
                if i != j and r[i][j] > minmax and r[i][j] > r[j][i]
            ]
        )

    # degree_max = max([(n, d) for n,d in G.degree().items()], key=itemgetter(1))[1]
    # nodes_to_remove = [n for (n,d) in G.degree().items() if d <= round(degree_max/2)]
    # G.remove_nodes_from(nodes_to_remove)

    # Removing too connected nodes (find automatic way to do it)
    #edges_to_remove = [ e for e in G.edges_iter() if
    # nodes_to_remove = [n for n in degree if degree[n] <= 1]
    # G.remove_nodes_from(nodes_to_remove)

    def getWeight(item):
        return item[1]

    #
    # node_degree = sorted(G.degree().items(), key=getWeight, reverse=True)
    # #print(node_degree)
    # nodes_too_connected = [n[0] for n in node_degree[0:(round(len(node_degree)/5))]]
    #
    # for n in nodes_too_connected:
    #     n_edges = list()
    #     for v in nx.neighbors(G,n):
    #         #print((n, v), G[n][v]['weight'], ":", (v,n), G[v][n]['weight'])
    #         n_edges.append(((n, v), G[n][v]['weight']))
    #
    #     n_edges_sorted = sorted(n_edges, key=getWeight, reverse=True)
    #     #G.remove_edges_from([ e[0] for e in n_edges_sorted[round(len(n_edges_sorted)/2):]])
    #     #G.remove_edges_from([ e[0] for e in n_edges_sorted[(round(len(nx.neighbors(G,n))/3)):]])
    #     G.remove_edges_from([ e[0] for e in n_edges_sorted[10:]])

    G.remove_nodes_from(nx.isolates(G))

    partition = best_partition(G.to_undirected())

    return(G, partition, ids, weight)
gargantext/graph/graph.py (deleted, 100644 → 0; diff collapsed)
gargantext/graph/growth.py (deleted, 100644 → 0)

"""
Computes ngram growth on periods
"""

from gargantext.models        import Node, NodeNgram, NodeNodeNgram, NodeNgramNgram
from gargantext.util.db_cache import cache
from gargantext.util.db       import session, bulk_insert, aliased, \
                                     func, get_engine  # = sqlalchemy.func like sum() or count()

from datetime import datetime


def timeframes(start, end):
    """
    timeframes :: String -> String -> (UTCTime, UTCTime, UTCTime)
    """
    start = datetime.strptime(str(start), "%Y-%m-%d")
    end   = datetime.strptime(str(end),   "%Y-%m-%d")

    date_0 = start - (end - start)
    date_1 = start
    date_2 = end

    return (date_0, date_1, date_2)


def compute_growth(corpus_id, groupList_id, mapList_id, start, end):
    """
    compute_graph :: Int -> UTCTime -> UTCTime -> Int -> Int
                  -> [(Int, Numeric)]

    This function uses the SQL function in
    /srv/gargantext/install/gargamelle/sqlFunctions.sql

    First compute occurrences of ngrams in mapList (with groups) on the first
    period, then on the second, and finally return the growth.
    Directly computed with the Postgres database (C) for optimization.
    """
    connection = get_engine()

    (date_0, date_1, date_2) = timeframes(start, end)

    query = """SELECT * FROM OCC_HIST( {corpus_id}
                                     , {groupList_id}
                                     , {mapList_id}
                                     , '{date_0}'
                                     , '{date_1}'
                                     , '{date_2}'
                                     )
            """.format(corpus_id    = corpus_id,
                       groupList_id = groupList_id,
                       mapList_id   = mapList_id,
                       date_0       = date_0,
                       date_1       = date_1,
                       date_2       = date_2)

    return(connection.execute(query))
gargantext/graph/intersection.py (deleted, 100644 → 0)

from gargantext.models     import Node, Ngram, NodeNgram, NodeNgramNgram, \
                                  HyperdataKey
from gargantext.util.db    import session, aliased, bulk_insert, func
from gargantext.util.lists import WeightedMatrix, UnweightedList, Translations
from gargantext.util.http  import JsonHttpResponse

from sqlalchemy import desc, asc, or_, and_, func

import datetime
import ast

import networkx as nx


def doc_freq(corpus_id, node_ids):
    '''
    doc_freq :: Corpus_id -> [(Ngram_id, Int)]
    Given a corpus, compute the number of documents that have the ngram in it.
    '''
    return (session.query(NodeNgram.ngram_id, func.count(NodeNgram.node_id))
                   .join(Node, NodeNgram.node_id == Node.id)
                   .filter(Node.parent_id == corpus_id, Node.typename == 'DOCUMENT')
                   .filter(NodeNgram.weight > 0, NodeNgram.ngram_id.in_(node_ids))
                   .group_by(NodeNgram.ngram_id)
                   .all()
            )


def doc_ngram_representativity(corpus_id, node_ids):
    '''
    doc_ngram_representativity :: Corpus_ID -> Dict Ngram_id Float
    Given a corpus, compute the share of documents that have the ngram in it.
    '''
    nodes_count = (session.query(Node)
                          .filter(Node.parent_id == corpus_id, Node.typename == 'DOCUMENT')
                          .count()
                   )

    result = dict()
    for ngram_id, somme in doc_freq(corpus_id, node_ids):
        result[ngram_id] = somme / nodes_count

    return result


def compare_corpora(Corpus_id_A, Corpus_id_B, node_ids):
    '''
    compare_corpora :: Corpus_id -> Corpus_id -> Dict Ngram_id Float
    Given two corpora:
    - if the corpora are the same, it returns
      (dict of document frequency per ngram as key)
    - if the corpora are different, it returns
      doc_ngram_representativity(Corpus_id_A) / doc_ngram_representativity(Corpus_id_B)
      (as dict per ngram as key)
    '''
    result = dict()

    if int(Corpus_id_A) == int(Corpus_id_B):
        for ngram_id, somme in doc_freq(Corpus_id_A, node_ids):
            result[ngram_id] = somme

    else:
        data_A = doc_ngram_representativity(Corpus_id_A, node_ids)
        data_B = doc_ngram_representativity(Corpus_id_B, node_ids)

        queue = list()

        for k in data_A.keys():
            if k not in data_B.keys():
                queue.append(k)
            else:
                result[k] = data_B[k] / data_A[k]

        maximum = max([result[k] for k in result.keys()])
        minimum = min([result[k] for k in result.keys()])

        for k in queue:
            result[k] = minimum

    return result


def intersection(request, corpuses_ids, measure='cooc'):
    '''
    intersection :: (str(Int) + "a" str(Int)) -> Dict(Ngram.id :: Int, Score :: Int)
    intersection = returns as Json Http Response the intersection of two graphs
    '''
    if request.method == 'POST' and "nodeids" in request.POST and len(request.POST["nodeids"]) > 0:

        node_ids = [int(i) for i in (ast.literal_eval(request.POST["nodeids"]))]
        # Here are the visible nodes of the initial semantic map.

        corpuses_ids = corpuses_ids.split('a')
        corpuses_ids = [int(i) for i in corpuses_ids]

        # corpus[1] will be the corpus to compare
        return JsonHttpResponse(compare_corpora(corpuses_ids[0], corpuses_ids[1], node_ids))
gargantext/graph/louvain.py (deleted, 100644 → 0; diff collapsed)
gargantext/graph/mail_notification.py (deleted, 100644 → 0)

from gargantext.models.users import User
from gargantext.util.db      import session

from django.core.mail     import send_mail
from gargantext.settings  import BASE_URL


def notify_owner(corpus, cooc_id, distance, bridgeness):
    user = session.query(User).filter(User.id == corpus.user_id).first()

    message = '''
    Bonjour,
    votre graph vient de se terminer dans votre corpus intitulé:
    %s

    Vous pouvez accéder et renommer votre Graph à l'adresse:
    http://%s/projects/%d/corpora/%d/explorer?cooc_id=%d&distance=%s&bridgeness=%d

    Nous restons à votre disposition pour tout complément d'information.
    Cordialement
    --
    L'équipe de Gargantext (CNRS)
    ''' % (corpus.name, BASE_URL, corpus.parent_id, corpus.id, cooc_id, distance, bridgeness)

    if user.email != "":
        send_mail('[Gargantext] Votre Graph est calculé',
                  message,
                  'team@gargantext.org',
                  [user.email],
                  fail_silently=False)
    else:
        print("User %s (%d), has no email" % (user.username, user.id))
gargantext/graph/rest.py (deleted, 100644 → 0; diff collapsed)
gargantext/graph/templates/explorer.html (deleted, 100644 → 0; diff collapsed)
gargantext/graph/urls.py (deleted, 100644 → 0)

from django.conf.urls import url

# Module "Graph Explorer"
from .rest         import Graph
from .views        import explorer, myGraphs
from .intersection import intersection

# TODO : factor urls
# url will have this pattern:
# ^explorer/$corpus_id/view
# ^explorer/$corpus_id/data.json
# ^explorer/$corpus_id/intersection

# GET ^api/projects/(\d+)/corpora/(\d+)/explorer$ -> data in json format

urlpatterns = [ url(r'^projects/(\d+)/corpora/(\d+)/explorer$', explorer)
              , url(r'^projects/(\d+)/corpora/(\d+)/myGraphs$', myGraphs)
              , url(r'^explorer/intersection/(\w+)$'          , intersection)
              ]
gargantext/graph/utils.py (deleted, 100644 → 0)

def compress_graph(graphdata):
    """
    graph data is usually a dict with 2 slots:
      "nodes": [{"id":4103, "type":"terms", "attributes":{"clust_default": 0}, "size":29, "label":"regard"},...]
      "links": [{"t": 998,"s": 768,"w": 0.0425531914893617},...]

    To send this data over the net, this function can reduce a lot of its size:
      - keep less decimals for float value of each link's weight
      - use shorter names for node properties (eg: s/clust_default/cl/)

    result format:
      "nodes": [{"id":4103, "at":{"cl": 0}, "s":29, "lb":"regard"},...]
      "links": [{"t": 998,"s": 768,"w": 0.042},...]
    """
    for link in graphdata['links']:
        link['w'] = format(link['w'], '.3f')  # keep only 3 decimals

    for node in graphdata['nodes']:
        node['lb'] = node['label']
        del node['label']

        #node['attributes']['growth'] = 0.8

        node['at'] = node['attributes']
        del node['attributes']

        node['at']['cl'] = node['at']['clust_default']
        del node['at']['clust_default']

        node['s'] = node['size']
        del node['size']

        if node['type'] == "terms":
            # its the default type for our format: so we don't need it
            del node['type']
        else:
            node['t'] = node['type']
            del node['type']

    return graphdata


def format_html(link):
    """
    Build an html link adapted to our json message format
    """
    return "<a class='msglink' href='%s'>%s</a>" % (link, link)
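For illustration only, a small usage sketch of compress_graph with invented data shaped like the docstring's example (the id, label and weight are made up; the import path is the one used before this commit):

```python
from gargantext.graph.utils import compress_graph  # pre-removal module path

# Hypothetical input, shaped like the "usual" graph data in the docstring.
sample = {
    "nodes": [{"id": 4103, "type": "terms",
               "attributes": {"clust_default": 0},
               "size": 29, "label": "regard"}],
    "links": [{"t": 998, "s": 768, "w": 0.0425531914893617}],
}

compressed = compress_graph(sample)
# compressed["nodes"] -> [{"id": 4103, "at": {"cl": 0}, "s": 29, "lb": "regard"}]
#   (the "terms" type is dropped because it is the default)
# compressed["links"] -> [{"t": 998, "s": 768, "w": "0.043"}]
#   (the weight is kept to 3 decimals, as a string produced by format())
```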
gargantext/graph/views.py (deleted, 100644 → 0)

from gargantext.util.http     import *
from gargantext.util.db       import *
from gargantext.util.db_cache import cache
from gargantext.models        import *
from gargantext.constants     import *
from gargantext.settings      import *

from gargantext.constants import USER_LANG

from datetime import datetime

from gargantext.views.pages.main import get_user_params


@requires_auth
def explorer(request, project_id, corpus_id):
    '''
    Graph explorer, also known as TinaWebJS, using SigmaJS.
    Nodes are ngrams (from title, abstract or journal name).
    Links represent a proximity measure.
    Data are received in RESTful mode (see rest.py).
    '''
    # we pass our corpus
    corpus = cache.Node[corpus_id]

    # security check
    user = cache.User[request.user.id]

    if corpus is None:
        raise Http404()

    if not user.owns(corpus):
        return HttpResponseForbidden()

    # get the maplist_id for modifications
    maplist_id = corpus.children(typename="MAPLIST").first().id

    # and the project just for project.id in corpusBannerTop
    project = cache.Node[project_id]

    # rendered page : explorer.html
    return render(
        template_name = 'explorer.html',
        request       = request,
        context       = {
            'debug'           : settings.DEBUG,
            'request'         : request,
            'user'            : request.user,
            'date'            : datetime.now(),
            'project'         : project,
            'corpus'          : corpus,
            'maplist_id'      : maplist_id,
            'view'            : 'graph',
            'user_parameters' : get_user_params(request.user),
            'languages'       : USER_LANG,
        },
    )


@requires_auth
def myGraphs(request, project_id, corpus_id):
    '''
    List all of my Graphs.
    Each Graph has one Node of Cooccurrences.
    Each Graph is saved in the hyperdata of its Node.
    '''
    user = cache.User[request.user.id]

    # we pass our corpus
    corpus = cache.Node[corpus_id]

    # and the project just for project.id in corpusBannerTop
    project = cache.Node[project_id]

    coocs = corpus.children('COOCCURRENCES', order=True).all()
    coocs_count = dict()

    for cooc in coocs:
        # FIXME : approximate number of nodes (not exactly what user sees in explorer)
        # Need to be connected with Graph Clustering
        cooc_nodes = (session.query(Ngram.id, func.count(Ngram.id))
                             .join(NodeNgramNgram, NodeNgramNgram.ngram1_id == Ngram.id)
                             .filter(NodeNgramNgram.node_id == cooc.id)
                             .filter(NodeNgramNgram.weight >= 1)
                             .group_by(Ngram.id)
                             .all()
                      )

        #coocs_count[cooc.id] = len(cooc_nodes)
        coocs_count[cooc.id] = len([cooc_node for cooc_node in cooc_nodes if cooc_node[1] > 1])

    print("coocs_count a posteriori", coocs_count)

    return render(
        template_name = 'pages/corpora/myGraphs.html',
        request       = request,
        context       = {
            'debug'           : settings.DEBUG,
            'request'         : request,
            'user'            : request.user,
            'date'            : datetime.now(),
            'project'         : project,
            'resourcename'    : get_resource_by_name(corpus),
            'corpus'          : corpus,
            'view'            : 'myGraph',
            'coocs'           : coocs,
            'coocs_count'     : coocs_count,
            'user_parameters' : get_user_params(request.user),
            'languages'       : USER_LANG,
        },
    )
gargantext/settings.py

@@ -43,7 +43,6 @@ CELERYBEAT_SCHEDULER = 'djcelery.schedulers.DatabaseScheduler'
 CELERY_IMPORTS = ( "gargantext.util.toolchain"
                  , "gargantext.util.crawlers"
-                 , "gargantext.graph.graph"
                  , "gargantext.moissonneurs.pubmed"
                  , "gargantext.moissonneurs.istex"
                  , "gargantext.util.ngramlists_tools"

@@ -65,7 +64,6 @@ INSTALLED_APPS = [
     'rest_framework'
     , 'djcelery'
     , 'gargantext.annotations'
-    , 'gargantext.graph'
     , 'gargantext.moissonneurs'
     , 'gargantext'
     ]
gargantext/urls.py

@@ -5,7 +5,6 @@ Views are shared between these modules:
 - `pages`, to present HTML views to the user
 - `contents`, for Python-generated contents
 - `annotations`, to annotate local context of a corpus (as global context)
-- `graph explorer`, to explore graphs
 """
 from django.conf.urls import include, url

@@ -21,9 +20,6 @@ import gargantext.views.pages.urls
 from gargantext.annotations import urls as annotations_urls
 from gargantext.annotations.views import main as annotations_main_view

-# Module for graph service
-import gargantext.graph.urls
-
 # Module Scrapers
 import gargantext.moissonneurs.urls

@@ -34,9 +30,6 @@ urlpatterns = [ url(r'^admin/' , admin.site.urls
               , url(r'^favicon.ico$', Redirect.as_view(url=static.url('favicon.ico')
                                                       , permanent=False), name="favicon")

-              # Module Graph
-              , url(r'^', include( gargantext.graph.urls ))
-
               # Module Annotation
               # tempo: unchanged doc-annotations routes --
               , url(r'^annotations/', include( annotations_urls ))
gargantext/views/api/urls.py

@@ -10,7 +10,7 @@ from . import ngrams
 from . import metrics
 from . import ngramlists
 from . import analytics
-from gargantext.graph.rest import Graph

 urlpatterns = [ url(r'^nodes$'       , nodes.NodeListResource.as_view())
               , url(r'^nodes/(\d+)$' , nodes.NodeResource.as_view())

@@ -37,14 +37,6 @@ urlpatterns = [ url(r'^nodes$' , nodes.NodeListResource.as_view()
                 # Metrics
               , url(r'^projects/(\d+)/corpora/(\d+)/metrics$', metrics.CorpusMetrics.as_view())

-                # GraphExplorer
-              , url(r'^projects/(\d+)/corpora/(\d+)/explorer$', Graph.as_view())
-                # data for graph explorer (json)
-                #     GET /api/projects/43198/corpora/111107/explorer?
-                #     Corresponding view is : /projects/43198/corpora/111107/explorer?
-                #     Parameters (example):
-                #     explorer?field1=ngrams&field2=ngrams&distance=conditional&bridgeness=5&start=1996-6-1&end=2002-10-5

                 # Ngrams
               , url(r'^ngrams/?$', ngrams.ApiNgrams.as_view())