Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
3f0cacb3
Commit
3f0cacb3
authored
Nov 04, 2014
by
Administrator
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[TESTS] new file ipynb exported
parent
7bafa359
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
542 additions
and
0 deletions
+542
-0
AlexandreTests.py
tests/AlexandreTests.py
+542
-0
No files found.
tests/AlexandreTests.py
0 → 100644
View file @
3f0cacb3
# coding: utf-8
# In[1]:
from
node.models
import
Node
,
NodeType
,
Project
,
Corpus
,
Document
,
Ngram
,
Node_Ngram
,
User
,
Language
,
ResourceType
# In[2]:
import pycountry

# Register every pycountry language that exposes a two-letter code, a
# `terminology` code and a name. Only English and French are flagged as
# implemented.
for language in pycountry.languages:
    try:
        iso2 = language.alpha2
        iso3 = language.terminology
        fullname = language.name
    except AttributeError:
        # Presumably pycountry omits attributes a language does not have;
        # such entries were silently skipped before and still are.
        continue
    implemented = 1 if iso2 in ['en', 'fr'] else 0
    try:
        Language(
            iso2=iso2,
            iso3=iso3,
            fullname=fullname,
            implemented=implemented,
        ).save()
    except Exception as error:
        # Still best effort (the loop goes on), but the failure is reported
        # instead of hidden, and KeyboardInterrupt is no longer swallowed.
        print(error)
# In[3]:
# Look up the two Language rows by their two-letter code, English first.
english, french = (
    Language.objects.get(iso2=code) for code in ('en', 'fr')
)
# In[4]:
# User that owns every node created below. Fetched by username and created
# only when it does not exist yet; any other error now propagates instead of
# being mistaken for "user missing".
me = User.objects.get_or_create(username='alexandre')[0]
# In[5]:
def _get_or_create_node_type(name):
    """Return the NodeType called `name`, creating and saving it if absent.

    Only a missing row triggers creation; any other failure propagates
    instead of producing a stray row.
    """
    try:
        return NodeType.objects.get(name=name)
    except NodeType.DoesNotExist as error:
        print(error)
        node_type = NodeType(name=name)
        node_type.save()
        return node_type


typeProject = _get_or_create_node_type('Project')
typeCorpus = _get_or_create_node_type('Corpus')
typeDoc = _get_or_create_node_type('Document')
# In[6]:
def _get_or_create_resource_type(name):
    """Return the ResourceType called `name`, creating and saving it if absent.

    Each type is resolved on its own, so one missing type no longer causes
    the types that already exist to be inserted a second time.
    """
    try:
        return ResourceType.objects.get(name=name)
    except ResourceType.DoesNotExist as error:
        print(error)
        resource_type = ResourceType(name=name)
        resource_type.save()
        return resource_type


typePubmed = _get_or_create_resource_type('pubmed')
typeIsi = _get_or_create_resource_type('isi')
typeRis = _get_or_create_resource_type('ris')
typePresse = _get_or_create_resource_type('europress')
# In[10]:
# WARNING: deletes every Node row before the fixtures below are rebuilt.
Node.objects.all().delete()

# In[8]:
# Root project node. Created only when the lookup finds no such node;
# other errors are no longer swallowed.
try:
    project = Node.objects.get(name='Bees project')
except Node.DoesNotExist:
    project = Node(name='Bees project', type=typeProject, user=me)
    project.save()
# ### Pubmed
# In[18]:
# PubMed corpus node under the project. Created only when the lookup finds
# no such node; other errors propagate.
try:
    corpus_pubmed = Node.objects.get(name='PubMed corpus')
except Node.DoesNotExist:
    corpus_pubmed = Node(
        parent=project,
        name='PubMed corpus',
        type=typeCorpus,
        user=me,
    )
    corpus_pubmed.save()
# In[19]:
# Attach the large PubMed sample archive to the corpus.
corpus_pubmed.add_resource(
    file='/srv/gargantext_lib/data_samples/pubmedBig.zip',
    type=typePubmed,
)

# In[20]:
# (an earlier run attached data_samples/pubmed.zip to `corpus_abeille`
# instead; that call is left disabled)

# In[21]:
corpus_pubmed.parse_resources()
pubmed_children = corpus_pubmed.children
pubmed_children.count()

# In[22]:
# Extract n-grams from the 'title' field of every child, then count the
# n-gram links of the first child.
pubmed_children.all().extract_ngrams(['title',])
first_pubmed_child = pubmed_children.all()[0]
Node_Ngram.objects.filter(node=first_pubmed_child).count()
# ### RIS
# In[9]:
# RIS corpus node under the project. Created only when the lookup finds no
# such node; other errors propagate.
try:
    corpus_ris = Node.objects.get(name='RIS corpus')
except Node.DoesNotExist:
    corpus_ris = Node(
        parent=project,
        name='RIS corpus',
        type=typeCorpus,
        user=me,
    )
    corpus_ris.save()
# In[10]:
# Attach the RIS sample archive to the corpus.
corpus_ris.add_resource(
    file='/srv/gargantext_lib/data_samples/risUnix.zip',
    type=typeRis,
)

# In[15]:
corpus_ris.parse_resources()

# In[16]:
ris_children = corpus_ris.children
ris_children.count()

# In[40]:
ris_children.all()

# In[28]:
# Rename the corpus. After this, a lookup by 'RIS corpus' no longer matches it.
setattr(corpus_ris, 'name', "ZOTERO CORPUS (CIRDEM)")
corpus_ris.save()
# ### Science
# In[23]:
# Web of Science corpus node under the project. Created only when the lookup
# finds no such node; other errors propagate.
try:
    science = Node.objects.get(name='WOS corpus')
except Node.DoesNotExist:
    science = Node(
        parent=project,
        name='WOS corpus',
        type=typeCorpus,
        user=me,
    )
    science.save()
# In[24]:
# Attach the ISI sample archive, parse it, and count the resulting children.
science.add_resource(
    file='/srv/gargantext_lib/data_samples/isi.zip',
    type=typeIsi,
)
science.parse_resources()
science_children = science.children
science_children.count()

# In[25]:
# Metadata of the last child (fails if the corpus has no children).
last_science_child = science_children.last()
last_science_child.metadata

# In[26]:
# Extract n-grams from the 'abstract' field of every child, then count the
# n-gram links of the first child.
science_children.all().extract_ngrams(['abstract',])
first_science_child = science_children.all()[0]
Node_Ngram.objects.filter(node=first_science_child).count()
# ### Press
# In[29]:
# Press corpus node under the project. Created only when the lookup finds no
# such node; other errors propagate.
try:
    presse = Node.objects.get(name='Presse corpus')
except Node.DoesNotExist:
    presse = Node(
        parent=project,
        name='Presse corpus',
        type=typeCorpus,
        user=me,
    )
    presse.save()
# In[30]:
# Attach the French HTML press sample archive to the corpus.
presse.add_resource(
    file='/srv/gargantext_lib/data_samples/html/html_french.zip',
    type=typePresse,
)

# In[31]:
presse.parse_resources()

# In[32]:
presse_children = presse.children
presse_children.count()

# In[33]:
# Extract n-grams from the 'title' field of every child.
presse_children.all().extract_ngrams(['title',])
# In[34]:
project.children.all()

# In[37]:
# NOTE(review): this cell referenced `corpus`, which is never defined in this
# script and raised NameError. `corpus_pubmed` is assumed to be the intended
# corpus -- confirm.
corpus_pubmed.children.all()
# # Building the lists
# In[57]:
import collections

# In[58]:
# Integer-valued counter; missing keys default to 0.
liste = collections.defaultdict(int)

# In[46]:
# Entries of `liste` ordered by ascending value.
# This cell used to run before `collections` and `liste` existed, and it
# passed `key=` to OrderedDict (which stored a bogus 'key' entry) instead of
# to sorted().
liste_ordered = collections.OrderedDict(
    sorted(liste.items(), key=lambda t: t[1])
)

# In[52]:
#liste_ordered
# In[59]:
# List node types, each resolved on its own so that one missing type does not
# insert a duplicate of the other.
whitelist_type = NodeType.objects.get_or_create(name='WhiteList')[0]
blacklist_type = NodeType.objects.get_or_create(name='BlackList')[0]

# A fresh white-list node and black-list node are created under the PubMed
# corpus on every run.
white_node = Node.objects.create(
    name='WhiteList Pubmed',
    user=me,
    parent=corpus_pubmed,
    type=whitelist_type,
)
black_node = Node.objects.create(
    name='BlackList Pubmed',
    user=me,
    parent=corpus_pubmed,
    type=blacklist_type,
)
# In[60]:
Node_Ngram.objects.filter(node=white_node).count()

# # Building the white list
# In[61]:
# `transaction` was used here without ever being imported.
from django.db import transaction

# Link to the white-list node every n-gram with n > 1 that is attached to a
# child of the PubMed corpus, with weight 1. All inserts share one transaction.
with transaction.atomic():
    for node in corpus_pubmed.children.all():
        for node_ngram in Node_Ngram.objects.filter(node=node):
            if node_ngram.ngram.n > 1:
                Node_Ngram.objects.create(
                    node=white_node,
                    ngram=node_ngram.ngram,
                    weight=1,
                )

# In[62]:
white_node.pk

# In[63]:
Node_Ngram.objects.filter(node=white_node).count()
# # Building the black list
# In[64]:
# `transaction` was used here without ever being imported.
from django.db import transaction

# Link the n-grams of rows 101..149 of the Node_Ngram table to the black-list
# node. The queryset has no explicit ordering, so which rows are picked is up
# to the database.
# NOTE(review): the original passed `occurences=1`; `weight=1` is used to match
# the white-list cell, which writes the `weight` field -- confirm the model.
with transaction.atomic():
    for node_ngram_object in Node_Ngram.objects.all()[101:150]:
        Node_Ngram.objects.create(
            node=black_node,
            ngram=node_ngram_object.ngram,
            weight=1,
        )

# In[12]:
Node_Ngram.objects.filter(node=black_node)
# # Building the synonyms
# In[13]:
# NodeNgramNgram was used below without being imported. Its table name in the
# raw SQL of this script (node_nodengramngram) suggests it lives in
# node.models -- confirm.
from node.models import NodeNgramNgram

# Created when absent, like the other node types in this script.
syno_type = NodeType.objects.get_or_create(name='Synonyme')[0]

# NOTE(review): the original passed `user=user` and `parent=corpus`, neither of
# which is defined in this script. `me` and `corpus_pubmed` are assumed from
# the node name and from the white/black list cells -- confirm.
syno_node = Node.objects.create(
    name='Syno Pubmed',
    user=me,
    parent=corpus_pubmed,
    type=syno_type,
)

# In[23]:
# Rows 3 and 4 of the white list; unpacking fails if the slice yields fewer
# than two rows.
synonyme1, synonyme2 = Node_Ngram.objects.filter(node=white_node)[3:5]

# In[24]:
NodeNgramNgram.objects.create(
    node=syno_node,
    ngramX=synonyme1.ngram,
    ngramY=synonyme2.ngram,
)
# # Cooccurrence
# In[65]:
white_node.children.count()

# In[66]:
black_node.pk

# In[67]:
# Created only when absent; other errors are no longer swallowed.
cooc_type = NodeType.objects.get_or_create(name='Cooccurrence')[0]

# In[68]:
# Node that will own the cooccurrence scores computed for the PubMed corpus.
cooc = Node.objects.create(
    user=me,
    parent=corpus_pubmed,
    type=cooc_type,
    name="Cooccurrences calcul Alpha",
)

# In[69]:
cooc.pk

# In[152]:
# Deletes every child node of the white-list node.
white_node.children.all().delete()
# In[70]:
from django.db import connection

cursor = connection.cursor()
# LOCK TABLE documents_ngramtemporary IN EXCLUSIVE MODE;

# Insert, for the `cooc` node, pairs of distinct n-grams taken from rows whose
# node is the white-list node, scored by how many joined row pairs share them.
# Values are passed as query parameters instead of being formatted into the
# SQL text.
# NOTE(review): `ORDER BY score` is ascending, so LIMIT keeps the 300
# lowest-scoring pairs -- confirm that DESC was not intended.
query_string = """
INSERT INTO node_nodengramngram (node_id, "ngramX_id", "ngramY_id", score)
SELECT
%s as node_id, x.ngram_id, y.ngram_id, COUNT(*) AS score
FROM
node_node_ngram AS x
INNER JOIN
node_node_ngram AS y
ON x.node_id = y.node_id
WHERE
x.id in (select id from node_node_ngram WHERE node_id = %s )
AND
y.id in (select id from node_node_ngram WHERE node_id = %s )
AND
x.ngram_id <> y.ngram_id
GROUP BY
x.ngram_id, y.ngram_id
HAVING count(*) > 1
ORDER BY score
LIMIT 300
"""
cursor.execute(query_string, [cooc.pk, white_node.pk, white_node.pk])

# A statement that returns no rows leaves `description` as None (DB-API), so
# the cursor is only drained when there is a result set. The original fetched
# unconditionally and hid the resulting error behind a bare `except`.
if cursor.description is not None:
    for row in iter(cursor.fetchone, None):
        print(row)
# In[1]:
# In[45]:
# In[71]:
from
copy
import
copy
import
numpy
as
np
import
pandas
as
pd
import
networkx
as
nx
from
collections
import
defaultdict
from
analysis.louvain
import
*
import
matplotlib.pyplot
as
plt
# Enable inline matplotlib output when running under IPython. In a plain
# interpreter `get_ipython` is not defined, so the script keeps the default
# backend instead of stopping with NameError.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass
# In[ ]:
# In[72]:
# NodeNgramNgram was used below without being imported. Its table name in the
# raw SQL of this script (node_nodengramngram) suggests it lives in
# node.models -- confirm.
from node.models import NodeNgramNgram

# Symmetric nested mapping: matrix[a][b] == matrix[b][a] == score of the pair,
# for every cooccurrence row of the `cooc` node whose score exceeds 1.
# (The placeholder assignment `matrix = ""` was dead and has been dropped.)
matrix = defaultdict(lambda: defaultdict(float))
for cooccurrence in NodeNgramNgram.objects.filter(node=cooc):
    if cooccurrence.score > 1:
        terms_x = cooccurrence.ngramX.terms
        terms_y = cooccurrence.ngramY.terms
        matrix[terms_x][terms_y] = cooccurrence.score
        matrix[terms_y][terms_x] = cooccurrence.score
# In[73]:
# Dense table of the nested score mapping; missing pairs become 0.
df = pd.DataFrame(matrix).T.fillna(0)
x = copy(df.values)

# In[74]:
# Two passes, thresholds n/2 then n/10 (n = number of rows). The 1-D mask is
# broadcast along the last axis, so column j is zeroed whenever the sum of
# row j exceeds the threshold.
for divisor in (2, 10):
    over_threshold = x.sum(axis=1) > x.shape[0] / divisor
    x = np.where(over_threshold, 0, x)

# In[75]:
# Same broadcasting: column j is divided by the sum of row j. Zero sums are
# not guarded against.
row_totals = x.sum(axis=1)
x = x / row_totals

# In[76]:
# Binary matrix: 1 where the normalised value exceeds 0.4, else 0.
matrix_filtered = np.where(x > .4, 1, 0)

# In[77]:
matrix_filtered
# In[78]:
# `from_numpy_matrix` is absent from recent networkx releases, so prefer
# `from_numpy_array` when the installed version provides it.
_graph_from_array = getattr(nx, 'from_numpy_array', None) or nx.from_numpy_matrix
G = _graph_from_array(matrix_filtered)

# Replace integer node ids with the column labels of `df`.
# NOTE(review): assumes df.index and df.columns list the terms in the same
# order -- confirm.
G = nx.relabel_nodes(G, dict(enumerate(df.columns)))

# In[79]:
nx.draw(G, with_labels=True)
plt.show()
# In[80]:
# `best_partition` presumably comes from the `analysis.louvain` star import;
# it is used below as a mapping from node to community id.
partition = best_partition(G)

# In[ ]:
#partition

# In[81]:
pos = nx.spring_layout(G)

# In[82]:
# One figure per community with more than `node_min` members; the members are
# also printed.
# NOTE(review): the nesting of this loop was reconstructed from an export that
# lost its indentation -- confirm that title/show belong inside the `if`.
count = 0.0
node_min = 3
for com in set(partition.values()):
    count = count + 1
    list_nodes = [
        member for member, community in partition.items() if community == com
    ]
    if len(list_nodes) > node_min:
        # `with_labels` is not a parameter of draw_networkx_nodes and is
        # rejected by current networkx, so it is no longer passed.
        nx.draw_networkx_nodes(G, pos, nodelist=list_nodes, node_size=20)
        nx.draw_networkx_edges(G, pos, alpha=0.5)
        plt.title("Clique " + str(count))
        for node in list_nodes:
            print(node)
        plt.show()
        print("-" * 30)
# In[ ]:
# In[ ]:
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment