# coding: utf-8
# Notebook export. This first part fills the reference tables (languages,
# user, node types, resource types), wipes every existing node and creates
# the "Bees project" together with its PubMed corpus.

# In[1]:

from node.models import Node, NodeType, Project, Corpus, Document, Ngram, Node_Ngram, User, Language, ResourceType


# In[2]:

import pycountry

# Store one Language row per pycountry entry; only English and French are
# flagged as implemented.
for language in pycountry.languages:
    try:
        iso2 = language.alpha2
        iso3 = language.terminology
        fullname = language.name
    except AttributeError:
        # Entries lacking one of the three attributes are skipped, as before.
        continue
    implemented = 1 if iso2 in ('en', 'fr') else 0
    try:
        Language(iso2=iso2, iso3=iso3, fullname=fullname, implemented=implemented).save()
    except Exception as error:
        # Still best effort (one failing row must not stop the loop), but the
        # failure is now reported instead of being silently swallowed.
        print(error)


# In[3]:

english = Language.objects.get(iso2='en')
french = Language.objects.get(iso2='fr')


# In[4]:

# get_or_create() replaces the "get, and on any exception create" pattern:
# it only creates when the row is really missing and lets other errors
# (database down, duplicated rows, ...) surface.
me, _ = User.objects.get_or_create(username='alexandre')


# In[5]:

typeProject, _ = NodeType.objects.get_or_create(name='Project')
typeCorpus, _ = NodeType.objects.get_or_create(name='Corpus')
typeDoc, _ = NodeType.objects.get_or_create(name='Document')


# In[6]:

# Each resource type is handled independently, so a single missing type no
# longer triggers the re-creation of the three others.
typePubmed, _ = ResourceType.objects.get_or_create(name='pubmed')
typeIsi, _ = ResourceType.objects.get_or_create(name='isi')
typeRis, _ = ResourceType.objects.get_or_create(name='ris')
typePresse, _ = ResourceType.objects.get_or_create(name='europress')


# In[10]:

# WARNING: removes every node in the database before rebuilding the project.
Node.objects.all().delete()


# In[8]:

try:
    project = Node.objects.get(name='Bees project')
except Node.DoesNotExist:
    project = Node(name='Bees project', type=typeProject, user=me)
    project.save()


# ### Pubmed

# In[18]:

try:
    corpus_pubmed = Node.objects.get(name='PubMed corpus')
except Node.DoesNotExist:
    corpus_pubmed = Node(parent=project, name='PubMed corpus', type=typeCorpus, user=me)
    corpus_pubmed.save()


# In[19]:
# Attach the sample archives to the PubMed, RIS, WOS and press corpora and
# parse them. Bare expressions such as `....count()` are leftovers of the
# notebook, where their value was displayed as the cell output.
corpus_pubmed.add_resource(file='/srv/gargantext_lib/data_samples/pubmedBig.zip', type=typePubmed)


# In[20]:

#corpus_abeille.add_resource(file='/srv/gargantext_lib/data_samples/pubmed.zip', type=typePubmed)


# In[21]:

corpus_pubmed.parse_resources()
corpus_pubmed.children.count()


# In[22]:

corpus_pubmed.children.all().extract_ngrams(['title',])
Node_Ngram.objects.filter(node=corpus_pubmed.children.all()[0]).count()


# ### RIS

# In[9]:

try:
    corpus_ris = Node.objects.get(name='RIS corpus')
except Node.DoesNotExist:
    corpus_ris = Node(parent=project, name='RIS corpus', type=typeCorpus, user=me)
    corpus_ris.save()


# In[10]:

corpus_ris.add_resource(file='/srv/gargantext_lib/data_samples/risUnix.zip', type=typeRis)


# In[15]:

corpus_ris.parse_resources()


# In[16]:

corpus_ris.children.count()


# In[40]:

corpus_ris.children.all()


# In[28]:

corpus_ris.name = "ZOTERO CORPUS (CIRDEM)"
corpus_ris.save()


# ### Science

# In[23]:

try:
    science = Node.objects.get(name='WOS corpus')
except Node.DoesNotExist:
    science = Node(parent=project, name='WOS corpus', type=typeCorpus, user=me)
    science.save()


# In[24]:

science.add_resource(file='/srv/gargantext_lib/data_samples/isi.zip', type=typeIsi)
science.parse_resources()
science.children.count()


# In[25]:

science.children.last().metadata


# In[26]:

science.children.all().extract_ngrams(['abstract',])
Node_Ngram.objects.filter(node=science.children.all()[0]).count()


# ### Press

# In[29]:

try:
    presse = Node.objects.get(name='Presse corpus')
except Node.DoesNotExist:
    presse = Node(parent=project, name='Presse corpus', type=typeCorpus, user=me)
    presse.save()


# In[30]:

presse.add_resource(file='/srv/gargantext_lib/data_samples/html/html_french.zip', type=typePresse)


# In[31]:

presse.parse_resources()


# In[32]:

presse.children.count()


# In[33]:

presse.children.all().extract_ngrams(['title',])


# In[34]:

project.children.all()


# In[37]:

# NOTE(review): the original cell read `corpus.children.all()`, but no
# `corpus` variable is ever assigned in this script. The PubMed corpus is the
# one the later list-building cells work on, so it is used here - confirm.
corpus_pubmed.children.all()


# In[46]:

# This cell was executed out of order in the notebook: `collections` and
# `liste` only appear in later cells. Both are made available here so the
# script runs top to bottom.
import collections

liste = collections.defaultdict(int)

# `key=` belongs to sorted(); passed to OrderedDict it added a bogus 'key'
# entry and left the items sorted by term instead of by count.
liste_ordered = collections.OrderedDict(sorted(liste.items(), key=lambda t: t[1]))


# In[52]:

#liste_ordered


# # Creating the lists
# In[57]:

# Builds the white list, black list, synonym and cooccurrence nodes that hang
# under the PubMed corpus.
import collections

# Both names are used below but were never imported by the notebook.
# NOTE(review): NodeNgramNgram is assumed to live in node.models next to
# Node_Ngram (its table is node_nodengramngram) - confirm.
from django.db import transaction
from node.models import NodeNgramNgram


# In[58]:

liste = collections.defaultdict(int)


# In[59]:

# Each type is fetched or created on its own, so an existing 'WhiteList' type
# is not duplicated when only 'BlackList' is missing.
whitelist_type, _ = NodeType.objects.get_or_create(name='WhiteList')
blacklist_type, _ = NodeType.objects.get_or_create(name='BlackList')

white_node = Node.objects.create(name='WhiteList Pubmed', user=me, parent=corpus_pubmed, type=whitelist_type)
black_node = Node.objects.create(name='BlackList Pubmed', user=me, parent=corpus_pubmed, type=blacklist_type)


# In[60]:

Node_Ngram.objects.filter(node=white_node).count()


# # Building the white list

# In[61]:

# Every n-gram of more than one term found in a document of the corpus gets
# one white list entry per occurrence row, all in a single transaction.
with transaction.atomic():
    for node in corpus_pubmed.children.all():
        for node_ngram in Node_Ngram.objects.filter(node=node):
            if node_ngram.ngram.n > 1:
                #liste[node_ngram.ngram.terms] += node_ngram.weight
                Node_Ngram.objects.create(node=white_node, ngram=node_ngram.ngram, weight=1)


# In[62]:

white_node.pk


# In[63]:

Node_Ngram.objects.filter(node=white_node).count()


# # Building the black list

# In[64]:

# The original passed `occurences=1` here. The white list above already
# relies on the `weight` field, so the same keyword is used for consistency.
# NOTE(review): confirm against the Node_Ngram model definition.
with transaction.atomic():
    for node_ngram_object in Node_Ngram.objects.all()[101:150]:
        Node_Ngram.objects.create(node=black_node, ngram=node_ngram_object.ngram, weight=1)


# In[12]:

Node_Ngram.objects.filter(node=black_node)


# # Creating the synonyms

# In[13]:

syno_type, _ = NodeType.objects.get_or_create(name='Synonyme')
# The original referenced undefined names `user` and `corpus`; the sibling
# 'WhiteList Pubmed' / 'BlackList Pubmed' nodes use `me` and `corpus_pubmed`.
syno_node = Node.objects.create(name='Syno Pubmed', user=me, parent=corpus_pubmed, type=syno_type)


# In[23]:

# Requires at least five white list entries, otherwise the unpacking fails.
synonyme1, synonyme2 = Node_Ngram.objects.filter(node=white_node)[3:5]


# In[24]:

NodeNgramNgram.objects.create(node=syno_node, ngramX=synonyme1.ngram, ngramY=synonyme2.ngram)


# # Cooccurrence

# In[65]:

white_node.children.count()


# In[66]:

black_node.pk


# In[67]:

cooc_type, _ = NodeType.objects.get_or_create(name='Cooccurrence')


# In[68]:

cooc = Node.objects.create(user=me, parent=corpus_pubmed, type=cooc_type, name="Cooccurrences calcul Alpha")
# In[69]:

# Computes the cooccurrences of the white list n-grams in SQL, turns them
# into a thresholded adjacency matrix and draws the Louvain communities.
cooc.pk


# In[152]:

white_node.children.all().delete()


# In[70]:

from django.db import connection

cursor = connection.cursor()

# LOCK TABLE documents_ngramtemporary IN EXCLUSIVE MODE;
# The node ids are passed as query parameters instead of being formatted
# into the SQL text.
query_string = """
    INSERT INTO node_nodengramngram (node_id, "ngramX_id", "ngramY_id", score)
    SELECT
        %s as node_id, x.ngram_id, y.ngram_id, COUNT(*) AS score
    FROM
        node_node_ngram AS x
    INNER JOIN
        node_node_ngram AS y
    ON
        x.node_id = y.node_id
    WHERE
        x.id in (select id from node_node_ngram WHERE node_id = %s )
    AND
        y.id in (select id from node_node_ngram WHERE node_id = %s )
    AND
        x.ngram_id <> y.ngram_id
    GROUP BY
        x.ngram_id, y.ngram_id
    HAVING
        count(*) > 1
    ORDER BY
        score
    LIMIT
        300
"""
cursor.execute(query_string, [cooc.pk, white_node.pk, white_node.pk])

# An INSERT without RETURNING produces no result set; the original relied on
# a bare `except` to swallow the resulting fetch error. `description` is None
# when there is nothing to fetch, so check it instead.
if cursor.description is not None:
    for row in cursor.fetchall():
        print(row)


# In[1]:


# In[45]:


# In[71]:

from copy import copy
import numpy as np
import pandas as pd
import networkx as nx
from collections import defaultdict
from analysis.louvain import *
import matplotlib.pyplot as plt

# NodeNgramNgram is queried below but was never imported by the notebook.
# NOTE(review): assumed to live in node.models (table node_nodengramngram).
from node.models import NodeNgramNgram

try:
    get_ipython().magic('matplotlib inline')
except NameError:
    # Not running under IPython: there is no inline backend to enable.
    pass


# In[ ]:


# In[72]:

# Symmetric term-by-term score table, restricted to scores above 1.
matrix = defaultdict(lambda: defaultdict(float))
for cooccurrence in NodeNgramNgram.objects.filter(node=cooc):
    if cooccurrence.score > 1:
        #print(x.ngramX.terms, x.ngramY.terms)
        term_x = cooccurrence.ngramX.terms
        term_y = cooccurrence.ngramY.terms
        matrix[term_x][term_y] = cooccurrence.score
        matrix[term_y][term_x] = cooccurrence.score


# In[73]:

df = pd.DataFrame(matrix).T.fillna(0)
x = copy(df.values)


# In[74]:

# NOTE(review): the 1-D mask built from the row sums broadcasts along the
# last axis, so it is *column* j that is zeroed when the sum of row j exceeds
# the threshold. Kept as in the original - confirm this is intended.
x = np.where((x.sum(axis=1) > x.shape[0] / 2), 0, x)
x = np.where((x.sum(axis=1) > x.shape[0] / 10), 0, x)


# In[75]:

# NOTE(review): same broadcasting as above (column j is divided by the sum of
# row j), and rows summing to 0 yield NaN, which the `> .4` test below then
# treats as "no edge". Kept as in the original.
x = x / x.sum(axis=1)


# In[76]:

matrix_filtered = np.where(x > .4, 1, 0)


# In[77]:

matrix_filtered


# In[78]:

G = nx.from_numpy_matrix(matrix_filtered)
# Nodes are numbered by matrix position; relabel them with the column terms.
G = nx.relabel_nodes(G, dict(enumerate(df.columns)))


# In[79]:

nx.draw(G, with_labels=True)
plt.show()


# In[80]:

partition = best_partition(G)


# In[ ]:

#partition


# In[81]:

pos = nx.spring_layout(G)


# In[82]:

count = 0.0
node_min = 3  # communities with this many nodes or fewer are not drawn
for com in set(partition.values()):
    count = count + 1
    list_nodes = [member for member, community in partition.items() if community == com]
    if len(list_nodes) > node_min:
        # `with_labels` is not a draw_networkx_nodes() parameter, so it was
        # dropped from the call.
        nx.draw_networkx_nodes(G, pos, nodelist=list_nodes, node_size=20)#, node_color = str(count / size))
        nx.draw_networkx_edges(G, pos, alpha=0.5)
        plt.title("Clique " + str(count))
        for node in list_nodes:
            print(node)
        plt.show()
        print("-" * 30)


# In[ ]:


# In[ ]: