Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
df88008f
Commit
df88008f
authored
Oct 19, 2015
by
delanoe
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FEAT] adding type to nodes (bipartite)
parent
657e4703
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
34 additions
and
41 deletions
+34
-41
cooccurrences.py
analysis/cooccurrences.py
+3
-9
functions.py
analysis/functions.py
+31
-32
No files found.
analysis/cooccurrences.py
View file @
df88008f
...
...
@@ -15,7 +15,8 @@ def do_cooc(corpus=None
,
cvalue_id
=
None
,
n_min
=
2
,
n_max
=
None
,
start
=
None
,
end
=
None
,
limit
=
1000
):
,
limit
=
1000
,
isMonopartite
=
True
):
'''
Compute the cooccurence matrix and save it, returning NodeNgramNgram.node_id
For the moment list of paramters are not supported because, lists need to
...
...
@@ -65,13 +66,6 @@ def do_cooc(corpus=None
doc_id
=
cache
.
NodeType
[
'Document'
]
.
id
if
field1
==
field2
==
'ngrams'
:
isMonopartite
=
True
else
:
isMonopartite
=
False
hyperdata_id
=
session
.
query
(
Hyperdata
)
.
filter
(
Hyperdata
.
name
==
'source'
)
.
first
()
.
id
test_query
=
(
session
.
query
(
NodeHyperdataNgram
)
.
join
(
Node
,
Node
.
id
==
NodeHyperdataNgram
.
node_id
)
...
...
analysis/functions.py
View file @
df88008f
...
...
@@ -30,30 +30,27 @@ def diag_null(x):
return
x
-
x
*
scipy
.
eye
(
x
.
shape
[
0
])
def
do_distance
(
cooc_id
):
def
do_distance
(
cooc_id
,
field1
=
None
,
field2
=
None
,
isMonopartite
=
True
):
'''
do_distance :: Int -> (Graph, Partition, {ids}, {weight})
'''
#print([n for n in session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==cooc_id).all()])
matrix
=
defaultdict
(
lambda
:
defaultdict
(
float
))
ids
=
d
ict
(
)
ids
=
d
efaultdict
(
lambda
:
defaultdict
(
int
)
)
labels
=
dict
()
weight
=
dict
()
Cooc
=
aliased
(
NodeNgramNgram
)
query
=
session
.
query
(
Cooc
)
.
filter
(
Cooc
.
node_id
==
cooc_id
)
.
all
()
#print(query)
for
cooc
in
query
:
labels
[
cooc
.
ngramx_id
]
=
cooc
.
ngramx_id
labels
[
cooc
.
ngramy_id
]
=
cooc
.
ngramy_id
for
cooc
in
query
:
matrix
[
cooc
.
ngramx_id
][
cooc
.
ngramy_id
]
=
cooc
.
score
matrix
[
cooc
.
ngramy_id
][
cooc
.
ngramx_id
]
=
cooc
.
score
ids
[
labels
[
cooc
.
ngramx_id
]]
=
cooc
.
ngramx_id
ids
[
labels
[
cooc
.
ngramy_id
]]
=
cooc
.
ngramy_id
ids
[
cooc
.
ngramx_id
]
=
(
field1
,
cooc
.
ngramx_id
)
ids
[
cooc
.
ngramy_id
]
=
(
field2
,
cooc
.
ngramy_id
)
weight
[
cooc
.
ngramx_id
]
=
weight
.
get
(
cooc
.
ngramx_id
,
0
)
+
cooc
.
score
weight
[
cooc
.
ngramy_id
]
=
weight
.
get
(
cooc
.
ngramy_id
,
0
)
+
cooc
.
score
...
...
@@ -66,7 +63,6 @@ def do_distance(cooc_id):
x
=
x
/
x
.
sum
(
axis
=
1
)
y
=
y
/
y
.
sum
(
axis
=
0
)
#print(x)
xs
=
x
.
sum
(
axis
=
1
)
-
x
ys
=
x
.
sum
(
axis
=
0
)
-
x
...
...
@@ -79,16 +75,13 @@ def do_distance(cooc_id):
n
=
n
.
sort
(
inplace
=
False
)
m
=
m
.
sort
(
inplace
=
False
)
#print(n)
#print(m)
nodes_included
=
300
#int(round(size/20,0))
#nodes_excluded = int(round(size/10,0))
nodes_specific
=
300
#int(round(size/10,0))
#nodes_generic = int(round(size/10,0))
# TODO use
r
the included score for the node size
# TODO use the included score for the node size
n_index
=
pd
.
Index
.
intersection
(
x
.
index
,
n
.
index
[:
nodes_included
])
# Generic:
#m_index = pd.Index.intersection(x.index, m.index[:nodes_generic])
...
...
@@ -98,9 +91,6 @@ def do_distance(cooc_id):
x_index
=
pd
.
Index
.
union
(
n_index
,
m_index
)
xx
=
x
[
list
(
x_index
)]
.
T
[
list
(
x_index
)]
# import pprint
# pprint.pprint(ids)
# Removing unconnected nodes
xxx
=
xx
.
values
threshold
=
min
(
xxx
.
max
(
axis
=
1
))
...
...
@@ -110,22 +100,26 @@ def do_distance(cooc_id):
G
=
nx
.
from_numpy_matrix
(
np
.
matrix
(
matrix_filtered
))
#G = nx.from_numpy_matrix(matrix_filtered, create_using=nx.MultiDiGraph())
G
=
nx
.
relabel_nodes
(
G
,
dict
(
enumerate
([
labels
[
label
]
for
label
in
list
(
xx
.
columns
)])))
G
=
nx
.
relabel_nodes
(
G
,
dict
(
enumerate
([
ids
[
id_
][
1
]
for
id_
in
list
(
xx
.
columns
)])))
# Removing too connected nodes (find automatic way to do it)
#edges_to_remove = [ e for e in G.edges_iter() if
degree
=
G
.
degree
()
G
.
remove_nodes_from
(
nx
.
isolates
(
G
))
#nodes_to_remove = [n for n in degree if degree[n] <= 1]
#G.remove_nodes_from(nodes_to_remove)
# = degree = G.degree()
# nodes_to_remove = [n for n in degree if degree[n] <= 1]
# G.remove_nodes_from(nodes_to_remove)
partition
=
best_partition
(
G
.
to_undirected
())
print
(
"Density of the graph:"
,
nx
.
density
(
G
))
return
(
G
,
partition
,
ids
,
weight
)
def
get_cooc
(
request
=
None
,
corpus
=
None
,
field1
=
'ngrams'
,
field2
=
'ngrams'
,
cooc_id
=
None
,
type
=
'node_link'
,
size
=
1000
):
,
cooc_id
=
None
,
type
=
'node_link'
,
size
=
1000
,
start
=
None
,
end
=
None
):
'''
get_ccoc : to compute the graph.
'''
...
...
@@ -135,24 +129,29 @@ def get_cooc(request=None, corpus=None
stop_id
=
get_or_create_node
(
nodetype
=
'StopList'
,
corpus
=
corpus
)
.
id
group_id
=
get_or_create_node
(
nodetype
=
'Group'
,
corpus
=
corpus
)
.
id
if
field1
==
field2
==
'ngrams'
:
isMonopartite
=
True
else
:
isMonopartite
=
False
# data deleted each time
#cooc_id = get_or_create_node(nodetype='Cooccurrence', corpus=corpus).id
#session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==cooc_id).delete()
cooc_id
=
do_cooc
(
corpus
=
corpus
,
field1
=
field1
,
field2
=
field2
,
miam_id
=
miam_id
,
group_id
=
group_id
,
stop_id
=
stop_id
,
limit
=
size
)
,
miam_id
=
miam_id
,
group_id
=
group_id
,
stop_id
=
stop_id
,
limit
=
size
,
isMonopartite
=
isMonopartite
)
G
,
partition
,
ids
,
weight
=
do_distance
(
cooc_id
)
G
,
partition
,
ids
,
weight
=
do_distance
(
cooc_id
,
field1
=
field1
,
field2
=
field2
,
isMonopartite
=
isMonopartite
)
if
type
==
"node_link"
:
for
node
in
G
.
nodes
():
for
node_id
in
G
.
nodes
():
try
:
#node,type(labels[node])
G
.
node
[
node
][
'pk'
]
=
ids
[
node
]
G
.
node
[
node
][
'label'
]
=
session
.
query
(
Ngram
.
terms
)
.
filter
(
Ngram
.
id
==
node
)
.
first
()
G
.
node
[
node
][
'size'
]
=
weight
[
ids
[
node
]
]
G
.
node
[
node
][
'type'
]
=
"NGrams"
G
.
node
[
node
][
'attributes'
]
=
{
"clust_default"
:
partition
[
node
]}
# new format
G
.
node
[
node
_id
][
'pk'
]
=
ids
[
node_id
][
1
]
G
.
node
[
node
_id
][
'label'
]
=
session
.
query
(
Ngram
.
terms
)
.
filter
(
Ngram
.
id
==
node_id
)
.
first
()
G
.
node
[
node
_id
][
'size'
]
=
weight
[
node_id
]
G
.
node
[
node
_id
][
'type'
]
=
ids
[
node_id
][
0
]
#G.node[node]['attributes'] = { "clust_default": partition[node_id
]} # new format
# G.add_edge(node, "cluster " + str(partition[node]), weight=3)
except
Exception
as
error
:
pass
#PrintException()
...
...
@@ -165,7 +164,7 @@ def get_cooc(request=None, corpus=None
for
e
in
G
.
edges_iter
():
s
=
e
[
0
]
t
=
e
[
1
]
info
=
{
"id"
:
i
,
"source"
:
ids
[
s
]
,
"target"
:
ids
[
t
]}
info
=
{
"id"
:
i
,
"source"
:
ids
[
s
]
[
1
]
,
"target"
:
ids
[
t
][
1
]}
# print(info)
links
.
append
(
info
)
i
+=
1
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment