Commit 3f0cacb3 authored by Administrator

[TESTS] new file ipynb exported

parent 7bafa359
# coding: utf-8
# In[1]:
from node.models import Node, NodeType, Project, Corpus, Document, Ngram, Node_Ngram, NodeNgramNgram, User, Language, ResourceType
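# NodeNgramNgram is assumed to live in node.models alongside Node_Ngram; it is
# needed below for the synonym and cooccurrence links.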
# In[2]:
import pycountry
for language in pycountry.languages:
    try:
        implemented = 1 if language.alpha2 in ['en', 'fr'] else 0
        Language(iso2=language.alpha2, iso3=language.terminology, fullname=language.name, implemented=implemented).save()
    except:
        pass
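# Seeds the Language table from pycountry; only English and French are flagged as
# implemented, and entries that raise (e.g. no alpha-2 code) are silently skipped.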
# In[3]:
english = Language.objects.get(iso2='en')
french = Language.objects.get(iso2='fr')
# In[4]:
try:
    me = User.objects.get(username='alexandre')
except:
    me = User(username='alexandre')
    me.save()
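# The try/except pattern used here (and in the cells below) could equally rely on
# Django's get_or_create, e.g.:
#   me, created = User.objects.get_or_create(username='alexandre')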
# In[5]:
try:
    typeProject = NodeType.objects.get(name='Project')
except Exception as error:
    print(error)
    typeProject = NodeType(name='Project')
    typeProject.save()

try:
    typeCorpus = NodeType.objects.get(name='Corpus')
except Exception as error:
    print(error)
    typeCorpus = NodeType(name='Corpus')
    typeCorpus.save()

try:
    typeDoc = NodeType.objects.get(name='Document')
except Exception as error:
    print(error)
    typeDoc = NodeType(name='Document')
    typeDoc.save()
# In[6]:
try:
    typePubmed = ResourceType.objects.get(name='pubmed')
    typeIsi = ResourceType.objects.get(name='isi')
    typeRis = ResourceType.objects.get(name='ris')
    typePresse = ResourceType.objects.get(name='europress')
except Exception as error:
    print(error)
    typePubmed = ResourceType(name='pubmed')
    typePubmed.save()
    typeIsi = ResourceType(name='isi')
    typeIsi.save()
    typeRis = ResourceType(name='ris')
    typeRis.save()
    typePresse = ResourceType(name='europress')
    typePresse.save()
# In[10]:
Node.objects.all().delete()
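# WARNING: this wipes every existing Node (projects, corpora, documents, lists)
# before the test data is rebuilt below.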
# In[8]:
try:
    project = Node.objects.get(name='Bees project')
except:
    project = Node(name='Bees project', type=typeProject, user=me)
    project.save()
# ### Pubmed
# In[18]:
try:
    corpus_pubmed = Node.objects.get(name='PubMed corpus')
except:
    corpus_pubmed = Node(parent=project, name='PubMed corpus', type=typeCorpus, user=me)
    corpus_pubmed.save()
# In[19]:
corpus_pubmed.add_resource(file='/srv/gargantext_lib/data_samples/pubmedBig.zip', type=typePubmed)
# In[20]:
#corpus_abeille.add_resource(file='/srv/gargantext_lib/data_samples/pubmed.zip', type=typePubmed)
# In[21]:
corpus_pubmed.parse_resources()
corpus_pubmed.children.count()
# In[22]:
corpus_pubmed.children.all().extract_ngrams(['title',])
Node_Ngram.objects.filter(node=corpus_pubmed.children.all()[0]).count()
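# extract_ngrams indexes the n-grams found in each document's title; the count above
# checks how many Node_Ngram links were created for the first document of the corpus.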
# ### RIS
# In[9]:
try:
    corpus_ris = Node.objects.get(name='RIS corpus')
except:
    corpus_ris = Node(parent=project, name='RIS corpus', type=typeCorpus, user=me)
    corpus_ris.save()
# In[10]:
corpus_ris.add_resource(file='/srv/gargantext_lib/data_samples/risUnix.zip', type=typeRis)
# In[15]:
corpus_ris.parse_resources()
# In[16]:
corpus_ris.children.count()
# In[40]:
corpus_ris.children.all()
# In[28]:
corpus_ris.name = "ZOTERO CORPUS (CIRDEM)"
corpus_ris.save()
# ### Science
# In[23]:
try:
    science = Node.objects.get(name='WOS corpus')
except:
    science = Node(parent=project, name='WOS corpus', type=typeCorpus, user=me)
    science.save()
# In[24]:
science.add_resource(file='/srv/gargantext_lib/data_samples/isi.zip', type=typeIsi)
science.parse_resources()
science.children.count()
# In[25]:
science.children.last().metadata
# In[26]:
science.children.all().extract_ngrams(['abstract',])
Node_Ngram.objects.filter(node=science.children.all()[0]).count()
# ### Press
# In[29]:
try:
    presse = Node.objects.get(name='Presse corpus')
except:
    presse = Node(parent=project, name='Presse corpus', type=typeCorpus, user=me)
    presse.save()
# In[30]:
presse.add_resource(file='/srv/gargantext_lib/data_samples/html/html_french.zip', type=typePresse)
# In[31]:
presse.parse_resources()
# In[32]:
presse.children.count()
# In[33]:
presse.children.all().extract_ngrams(['title',])
# In[34]:
project.children.all()
# In[37]:
corpus_pubmed.children.all()
# In[46]:
# `collections` and `liste` are defined in the cells further down.
liste_ordered = collections.OrderedDict(sorted(liste.items(), key=lambda t: t[1]))
# In[52]:
#liste_ordered
# # Creating the lists
# In[57]:
import collections
# In[58]:
liste = collections.defaultdict(int)
# In[59]:
try:
    whitelist_type = NodeType.objects.get(name='WhiteList')
    blacklist_type = NodeType.objects.get(name='BlackList')
except:
    whitelist_type = NodeType(name='WhiteList')
    whitelist_type.save()
    blacklist_type = NodeType(name='BlackList')
    blacklist_type.save()

white_node = Node.objects.create(name='WhiteList Pubmed', user=me, parent=corpus_pubmed, type=whitelist_type)
black_node = Node.objects.create(name='BlackList Pubmed', user=me, parent=corpus_pubmed, type=blacklist_type)
# In[60]:
Node_Ngram.objects.filter(node=white_node).count()
# # Creating the white list
# In[61]:
from django.db import transaction

with transaction.atomic():
    for node in corpus_pubmed.children.all():
        for node_ngram in Node_Ngram.objects.filter(node=node):
            if node_ngram.ngram.n > 1:
                #liste[node_ngram.ngram.terms] += node_ngram.weight
                Node_Ngram.objects.create(node=white_node, ngram=node_ngram.ngram, weight=1)
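# Only multi-word n-grams (n > 1) found in the PubMed documents are copied into the
# white list, each with a weight of 1.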
# In[62]:
white_node.pk
# In[63]:
Node_Ngram.objects.filter(node=white_node).count()
# # Creating the black list
# In[64]:
with transaction.atomic():
    for node_ngram_object in Node_Ngram.objects.all()[101:150]:
        Node_Ngram.objects.create(node=black_node, ngram=node_ngram_object.ngram, weight=1)
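# An arbitrary slice of existing Node_Ngram rows is copied under the black list,
# simply to give it some test content.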
# In[12]:
Node_Ngram.objects.filter(node=black_node)
# # Creating the synonyms
# In[13]:
try:
    syno_type = NodeType.objects.get(name='Synonyme')
except:
    syno_type = NodeType(name='Synonyme')
    syno_type.save()
syno_node = Node.objects.create(name='Syno Pubmed',
                                user=me,
                                parent=corpus_pubmed,
                                type=syno_type)
# In[23]:
synonyme1, synonyme2 = Node_Ngram.objects.filter(node=white_node)[3:5]
# In[24]:
NodeNgramNgram.objects.create(node=syno_node, ngramX=synonyme1.ngram, ngramY=synonyme2.ngram)
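# A NodeNgramNgram row attached to syno_node links two n-grams taken from the white
# list, marking them as synonyms of each other.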
# # Cooccurrence
# In[65]:
white_node.children.count()
# In[66]:
black_node.pk
# In[67]:
try:
    cooc_type = NodeType.objects.get(name='Cooccurrence')
except:
    cooc_type = NodeType(name='Cooccurrence')
    cooc_type.save()
# In[68]:
cooc = Node.objects.create(user=me, parent=corpus_pubmed, type=cooc_type, name="Cooccurrences calcul Alpha")
# In[69]:
cooc.pk
# In[152]:
white_node.children.all().delete()
# In[70]:
from django.db import connection
cursor = connection.cursor()
# LOCK TABLE documents_ngramtemporary IN EXCLUSIVE MODE;
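# The raw SQL below self-joins node_node_ngram restricted to the white-list rows:
# for every pair of distinct n-grams it counts the matching row pairs, keeps the
# pairs seen more than once, and inserts up to 300 of them as scored NodeNgramNgram
# rows attached to the cooccurrence node.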
query_string = """
    INSERT INTO node_nodengramngram (node_id, "ngramX_id", "ngramY_id", score)
    SELECT
        %d AS node_id, x.ngram_id, y.ngram_id, COUNT(*) AS score
    FROM
        node_node_ngram AS x
    INNER JOIN
        node_node_ngram AS y
        ON x.node_id = y.node_id
    WHERE
        x.id IN (SELECT id FROM node_node_ngram WHERE node_id = %d)
        AND
        y.id IN (SELECT id FROM node_node_ngram WHERE node_id = %d)
        AND
        x.ngram_id <> y.ngram_id
    GROUP BY
        x.ngram_id, y.ngram_id
    HAVING COUNT(*) > 1
    ORDER BY score
    LIMIT 300
""" % (cooc.pk, white_node.pk, white_node.pk)
cursor.execute(query_string)
try:
    while True:
        row = cursor.fetchone()
        if row is None:
            break
        print(row)
except:
    pass
# In[1]:
# In[45]:
# In[71]:
from copy import copy
import numpy as np
import pandas as pd
import networkx as nx
from collections import defaultdict
from analysis.louvain import *
import matplotlib.pyplot as plt
get_ipython().magic('matplotlib inline')
# In[ ]:
matrix = ""
# In[72]:
matrix = defaultdict(lambda: defaultdict(float))
for cooccurrence in NodeNgramNgram.objects.filter(node=cooc):
    if cooccurrence.score > 1:
        #print(cooccurrence.ngramX.terms, cooccurrence.ngramY.terms)
        matrix[cooccurrence.ngramX.terms][cooccurrence.ngramY.terms] = cooccurrence.score
        matrix[cooccurrence.ngramY.terms][cooccurrence.ngramX.terms] = cooccurrence.score
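# matrix now holds a symmetric term-by-term table of cooccurrence scores (only pairs
# with a score above 1), keyed by the n-gram strings.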
# In[73]:
df = pd.DataFrame(matrix).T.fillna(0)
x = copy(df.values)
# In[74]:
x = np.where((x.sum(axis=1) > x.shape[0] / 2), 0, x )
x = np.where((x.sum(axis=1) > x.shape[0] / 10), 0, x )
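# These two filters are meant to zero out overly frequent terms; note that the 1-D
# row sums broadcast against the 2-D array, so the mask is applied column-wise.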
# In[75]:
x = x / x.sum(axis=1)
# In[76]:
matrix_filtered = np.where(x > .4, 1, 0)
# In[77]:
matrix_filtered
# In[78]:
G = nx.from_numpy_matrix(matrix_filtered)
G = nx.relabel_nodes(G, dict(enumerate(df.columns)))
# In[79]:
nx.draw(G, with_labels=True)
plt.show()
# In[80]:
partition = best_partition(G)
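# best_partition comes from the bundled Louvain implementation (analysis.louvain,
# wildcard-imported above) and maps every graph node to a community id.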
# In[ ]:
#partition
# In[81]:
pos = nx.spring_layout(G)
# In[82]:
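# Each Louvain community with more than node_min members is drawn as its own figure,
# and its member terms are printed below it.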
count = 0.0
node_min = 3
for com in set(partition.values()):
    count = count + 1
    list_nodes = [nodes for nodes in partition.keys() if partition[nodes] == com]
    if len(list_nodes) > node_min:
        nx.draw_networkx_nodes(G, pos, list_nodes, node_size=20, with_labels=True)#, node_color = str(count / size))
        nx.draw_networkx_edges(G, pos, alpha=0.5)
        plt.title("Clique " + str(count))
        for node in list_nodes:
            print(node)
        plt.show()
        print("-" * 30)
# In[ ]:
# In[ ]: