Merge branch 'romain' into unstable

bcb68c69 · delanoe · 6f0c86f8 · 8b76ac19 · bcb68c69 · bcb68c69
Commit bcb68c69 authored Dec 18, 2015 by delanoe
Hide whitespace changes
Inline Side-by-side

Showing with 311 additions and 88 deletions

urls.py annotations/urls.py +6 -0

views.py annotations/views.py +4 -1

importExport.py ngram/importExport.py +301 -87

No files found.
--- a/annotations/urls.py
+++ b/annotations/urls.py
@@ -2,9 +2,15 @@ from django.conf.urls import patterns, url
 from annotations import views


+# /!\ urls patterns here are *without* the trailing slash
+
 urlpatterns = patterns('',
+    # json:title,id,authors,journal,
+    #      publication_date
+    #      abstract_text,full_text
    url(r'^document/(?P<doc_id>[0-9]+)$', views.Document.as_view()), # document view
    url(r'^corpus/(?P<corpus_id>[0-9]+)/document/(?P<doc_id>[0-9]+)$', views.NgramList.as_view()), # the list associated with an ngram
    url(r'^lists/(?P<list_id>[0-9]+)/ngrams/(?P<ngram_ids>[0-9,\+]+)+$', views.NgramEdit.as_view()),
+    # POST (fixed 2015-12-16)
    url(r'^lists/(?P<list_id>[0-9]+)/ngrams/create$', views.NgramCreate.as_view()), #
 )
--- a/annotations/views.py
+++ b/annotations/views.py
@@ -13,7 +13,7 @@ from rest_framework.exceptions import APIException
 from rest_framework.authentication import SessionAuthentication, BasicAuthentication

 from node.models import Node
-from gargantext_web.db import session, cache, Node, NodeNgram
+from gargantext_web.db import session, cache, Node, NodeNgram, Ngram
 from ngram.lists import listIds, listNgramIds
 from gargantext_web.db import get_or_create_node

@@ -138,6 +138,8 @@ class NgramCreate(APIView):
    def post(self, request, list_id):
        """
        create NGram in a given list
+        
+        example: request.data = {'text': 'phylogeny'}
        """
        list_id = int(list_id)
        # format the ngram's text
@@ -161,6 +163,7 @@ class NgramCreate(APIView):
        ngram_id = ngram.id
        # create the new node_ngram relation
        # TODO check existing Node_Ngram ?
+        # £TODO ici indexation
        node_ngram = NodeNgram(node_id=list_id, ngram_id=ngram_id, weight=1.0)
        session.add(node_ngram)
        session.commit()

--- a/ngram/importExport.py
+++ b/ngram/importExport.py
-import re
-from admin.utils import PrintException
+"""
+Import and export all lists from a corpus node

-from gargantext_web.db import Node, Ngram, NodeNgram, NodeNodeNgram, NodeNgramNgram
+
+TODO : FEAT GROUPED ITEMS ARE NOT HANDLED (synonyms)
+            =======
+
+TODO : REFACTOR 1) split list logic from corpus logic
+                    => possibility to act on one list 
+
+TODO : REFACTOR 2) improvements in ngram creation (?bulk like node_ngram links)
+"""
+
+from gargantext_web.db import Ngram, NodeNgram, NodeNodeNgram
 from gargantext_web.db import cache, session, get_or_create_node, bulk_insert

-import sqlalchemy as sa
-from sqlalchemy.sql import func
-from sqlalchemy import desc, asc, or_, and_, Date, cast, select
-from sqlalchemy import literal_column
-from sqlalchemy.orm import aliased
+# import sqlalchemy as sa
+from sqlalchemy.sql import func, exists
+# from sqlalchemy import desc, asc, or_, and_, Date, cast, select
+#from sqlalchemy import literal_column
+#from sqlalchemy.orm import aliased

-from ngram.tools import insert_ngrams
-from analysis.lists import WeightedList, UnweightedList
+#from ngram.tools import insert_ngrams
+#from analysis.lists import WeightedList, UnweightedList

 from collections import defaultdict
 from csv import writer, reader, QUOTE_MINIMAL


-def get_id(ngram_terms):
-    query = session.query(Ngram.id).filter(Ngram.terms==ngram_terms).first()
-    return(query)
-
-
-def exportNgramList(node,filename,delimiter="\t"):
+def exportNgramLists(node,filename,delimiter="\t"):
+    """
+    export des 3 listes associées à un node corpus
+           en combinaison locale avec les groupements
+    """
+    
+    # the node arg has to be a corpus here
+    if not hasattr(node, "type_id") or node.type_id != 4:
+        raise TypeError("EXPORT: node argument must be a Corpus Node")
    
    # les nodes couvrant les listes
    # -----------------------------
@@ -36,14 +49,27 @@ def exportNgramList(node,filename,delimiter="\t"):
    # ------------------------------------
    #~~ contenu: liste des ids [2562,...]
    stop_ngram_ids  = [stop_ngram.ngram_id for stop_ngram in stop_node.node_node_ngram_collection]
+    
    # idem pour miam et map
    miam_ngram_ids  = [miam_ng.ngram_id for miam_ng in miam_node.node_node_ngram_collection]
    map_ngram_ids   = [map_ng.ngram_id for map_ng in map_node.node_node_ngram_collection]
+    
+    
+    # pour debug ---------->8 --------------------
+    #~ stop_ngram_ids = stop_ngram_ids[0:10]
+    #~ miam_ngram_ids = miam_ngram_ids[0:10]
+    #~ map_ngram_ids = map_ngram_ids[0:10]
+    # --------------------->8 --------------------

    # pour la group_list on a des couples de ngram_ids
    # -------------------
    # ex: [(3544, 2353), (2787, 4032), ...]
    group_ngram_id_couples = [(nd_ng_ng.ngramx_id,nd_ng_ng.ngramy_id) for nd_ng_ng in group_node.node_nodengramngram_collection]
+    
+    
+    # pour debug
+    # group_ngram_id_couples = []
+    

    # k couples comme set 
    # --------------------
@@ -57,7 +83,7 @@ def exportNgramList(node,filename,delimiter="\t"):
    
    
    # helper func
-    def ngrams_to_csv_rows(ngram_ids, id_groupings={}, list_type=7):
+    def ngrams_to_csv_rows(ngram_ids, id_groupings={}, list_type=0):
        """
        Table d'infos basiques par ngram :
          (ng_id, forme du terme, poids, type_de_liste)
@@ -72,9 +98,17 @@ def exportNgramList(node,filename,delimiter="\t"):
                 ]
        
        (ensuite par exemple csv.writer.writerows(csv_rows))
+        
+        list_type ici:
+          0  <=> stopList
+          1  <=> miamList
+          2  <=> mapList
        """
        # récupérer d'un coup les objets Ngram (avec terme)
-        ng_objs = session.query(Ngram).filter(Ngram.id.in_(ngram_ids)).all()
+        if len(ngram_ids):
+            ng_objs = session.query(Ngram).filter(Ngram.id.in_(ngram_ids)).all()
+        else:
+            ng_objs = []
        
        # les transcrire en tableau (liste de listes)
        csv_rows = list()
@@ -88,7 +122,7 @@ def exportNgramList(node,filename,delimiter="\t"):
                this_grouped = ""
            
            # transcription : 5 colonnes
-            # ID , terme , n , type_de_liste , gid|gid|gid
+            # ID , terme , n , type_de_liste , grouped_id|grouped_id...
            
            csv_rows.append(
                  [ng_id,ng_obj.terms,ng_obj.n,list_type,this_grouped]
@@ -130,34 +164,78 @@ def exportNgramList(node,filename,delimiter="\t"):
    


-def importNgramList(node,filename,delimiter="\t",modify_lists=[0,1,2]):
+def importNgramLists(node,filename,delimiter="\t", del_lists=[]):
    '''
    Suppose une table CSV avec colonnes comme dans fonction export.
    
-    /!\ efface et remplace les listes existantes  /!\
-    /!\ (supprime leur collection de NodeNgrams)  /!\    
+    
+    del_lists : int[]
+    
+    /!\  si del_lists contient un ou plusieurs    /!\
+    /!\  types de listes (array parmi [0,1,2])    /!\
+    /!\ on efface et remplace la liste existante  /!\
+    /!\ (supprime leur collection de NodeNgrams)  /!\
+    
+    par exemple 
+    del_lists = [0,1] => effacera la stopList (aka 0)
+                               et la miamList (aka 1)
+                         mais pas la mapList (aka 2)
+    
+    
+    TODO: 
+      - import "group cliques joining" from rest_v1_0.ngrams.Group
+        (and ideally add its logic to analysis.lists.Translations)
    
    '''
    
-    list_types_shortcuts = {
-        0: "StopList",
-        1: "MiamList",
-        2: "MapList",
-    }
+    # the node arg has to be a corpus here
+    if not hasattr(node, "type_id") or node.type_id != 4:
+        raise TypeError("IMPORT: node argument must be a Corpus Node")
    
-    # on supprime tous les NodeNgrams des listes à modifier
-    # ------------------------------------------------------
-#    for list_shortcut in modify_lists:
-#        # find previous listnode id
-#        list_type = list_types_shortcuts[list_shortcut]
-#        list_node = get_or_create_node(nodetype=list_type, corpus=node)
-#        node_id = listnode.id
-#        
-#        # delete previous lists
-#        session.query(NodeNgram).filter(NodeNgram.node_id==list_node.id).delete()
-#        session.commit()
+    # for stats
+    added_nd_ng = 0   # number of added list elements
+    added_ng = 0      # number of added unknown ngrams
+    
+    
+    # our list shortcuts will be 0,1,2
+    our_ls = [
+       {'name':"StopList", 'weight':-1.0,  'node': None,   'add_data':[]},
+       {'name':"MiamList", 'weight':1.0,   'node': None,   'add_data':[]},
+       {'name':"MapList",  'weight':2.0,   'node': None,   'add_data':[]}
+        #   ^^^^^^^^^^^^^^^^^^^^^^^^^       ^^^^^^^^^^      ^^^^^^^^^^
+        #        paramètres                  "cibles"        résultats
+    ]
+    
+    # on mettra dans add_data les termes avec le ngram_id retrouvé/créé
+    
+    # find previous list node objects
+    # (les 3 listes où on va écrire)
+    for ltype in [0,1,2]:
+        our_ls[ltype]['node'] = get_or_create_node(
+                                   nodetype=our_ls[ltype]['name'], 
+                                   corpus=node
+                                )
+        
+    # si del_lists, on supprime tous les NodeNgrams des listes
+    # --------------------------------------------------------
+    for ltype in del_lists:
+        this_list_id = our_ls[ltype]['node'].id
+        
+        # DELETE contents of previous lists
+        session.query(NodeNgram).filter(NodeNgram.node_id==this_list_id).delete()
+        session.commit()
+        # todo garbage collect terms ?
+    
+    
+    
+    # also find group node
+    group = get_or_create_node(nodetype='Group', corpus=node)
    
+    # it will be merged at the end with the imported_groups dict
+    imported_groups = defaultdict(set)
    
+    
+    # --------------
    # on lit le CSV
    # --------------
    ngrams_csv_rows = []
@@ -167,76 +245,212 @@ def importNgramList(node,filename,delimiter="\t",modify_lists=[0,1,2]):
                                 delimiter = delimiter,
                                 quoting   = QUOTE_MINIMAL
                                 )
-    
-        all_read_terms = list()
-        map_terms = set()
-        for csv_row in ngrams_csv_rows:
+        
+        # vérifications initiales (version naïve terme par terme)
+        #   ==> existence ?
+        #       sinon création ngram
+        #   ==> stockage dans add_data pour bulk_insert
+        for i, csv_row in enumerate(ngrams_csv_rows):
            this_ng_id           = csv_row[0]
            this_ng_terms        = csv_row[1]
-            this_ng_nlen         = csv_row[2]
-            this_ng_list_type_id = csv_row[3]
-            this_ng_grouped_ngs  = csv_row[4]
+            this_ng_nlen         = int(csv_row[2])
+            this_ltype           = int(csv_row[3])
+            this_ng_group        = csv_row[4]
            
-            if this_ng_list_type_id == str(2):
-                map_terms.add(this_ng_terms)
-
-            # --- quelle liste cible ?
+            # --- vérif terme
+            if not len(this_ng_terms) > 0:
+                print("WARNING: (skip line) empty term at CSV %s:l.%i" % (filename, i))
+                continue
            
-            # par ex: "MiamList"
-            #list_type = type_ids_cache[this_ng_list_type_id]
+            # === quelle liste cible ?
+            if this_ltype in [0,1,2]:
+                # par ex: "MiamList"
+                list_type = our_ls[this_ltype]['name']
+                tgt_list_node = our_ls[this_ltype]['node']
+            else:
+                print("WARNING: (skip line) wrong list_type at CSV %s:l.%i" % (filename, i))
+                continue
            
-            #tgt_list_node = get_or_create_node(nodetype=list_type, corpus=node)
-                
            
-            # --- test 1: forme existante dans node_ngram ?
+            print("IMPORT '%s' >> %s" % (this_ng_terms,list_type))
            
-            #preexisting = session.query(Ngram).filter(Ngram.terms == this_ng_terms).first()
+            # --- test 1: forme existante dans node_ngram ?
            
-            #if preexisting is None:
-            #   # todo ajouter Ngram dans la table node_ngram
-                #      avec un nouvel ID
+            preexisting = session.query(Ngram).filter(Ngram.terms == this_ng_terms).first()
            
+            if preexisting is None:
+                # ajout ngram dans la table node_ngram
+                new_ng = Ngram(terms = this_ng_terms,
+                                  n  = this_ng_nlen)
+                
+                # INSERT INTO node_ngram
+                # ======================
+                session.add(new_ng)
+                session.commit()
+                added_ng += 1
+                
+                # avec un nouvel ID
+                our_ls[ltype]['add_data'].append(
+                          [tgt_list_node.id, new_ng.id, our_ls[ltype]['weight']]
+                         )
+                
+                # £TODO ici indexation dans les docs
+                # => Occurrences
+                # node_ngram = NodeNgram(node_id=list_id, ngram_id=ngram_id, weight=1.0)
            
-            # --- test 2: forme déjà dans une liste ?
            
-            #if preexisting is not None:
-            #    # premier node de type "liste" mentionnant ce ngram_id
-            #    #
-            #    node_ngram = preexisting.node_node_ngram_collection[0]
-            #    previous_list = node_ngram.node_id
-            #
+            # cas ngram existant
+            else:
+                add_ng_id = preexisting.id
+                
+                # --- test 2: forme déjà dans la même liste ? 
+                # (sauf si delete)
+                if not this_ltype in del_lists:
+                    # méthode traditionnelle
+                    # session.query(NodeNgram)
+                    #    .filter(NodeNgram.node_id == my_miam.id)
+                    #    .filter(NodeNgram.ngram_id == preexisting.id)
+                    
+                    
+                    # méthode avec exists() (car on n'a pas besoin de récupérer l'objet)
+                    already_flag = session.query(
+                           exists().where(
+                              (NodeNgram.node_id == tgt_list_node.id) 
+                              & (NodeNgram.ngram_id == preexisting.id)
+                            )
+                         ).scalar()
+                    
+                    if already_flag:
+                        print("INFO: (skip line) already got %s in this list %s" %(this_ng_terms, list_type))
+                        continue
+                    
+                    # --- TODO test 3 : forme dans une autre liste ?
+                    #    par ex: conflit SI forme dans stop ET ajoutée à map
+                
+                    else:
+                        # append to results
+                        our_ls[ltype]['add_data'].append(
+                            [tgt_list_node.id, preexisting.id, our_ls[ltype]['weight']]
+                         )
+                
+                # si c'est une liste à effacer on ajoute toujours
+                else:
+                    # append to results
+                    our_ls[ltype]['add_data'].append(
+                        [tgt_list_node.id, preexisting.id, our_ls[ltype]['weight']]
+                     )
+        
            
-            # ---------------
+            # --- TODO éléments groupés
            
-            #data[0] = tgt_list_node.id
-            #data[1] = this_ng_id          # on suppose le même ngram_id
-            #data[2] = 
+            # grouped synonyms set (if any)
+            if len(this_ng_group) != 0:
+                imported_groups[this_ng_id] = set(
+                    [int(ng_id) for ng_id in this_ng_group.split('|')]
+                    )
+    
+    
+    # INSERT INTO node_node_ngram
+    # ============================
+    for list_type in [0,1,2]:
+        bulk_insert(
+           NodeNgram, 
+           ['node_id', 'ngram_id', 'weight'],
+           [d for d in our_ls[list_type]['add_data']]
+        )
        
-    map_node = get_or_create_node(corpus=node, nodetype='MapList')
-    session.query(NodeNgram).filter(NodeNgram.node_id==map_node.id).delete()
-    map_id_terms = (session.query(Ngram.id, Ngram.terms)
-                           .filter(Ngram.terms.in_(list(map_terms)))
-                           .all()
-                           )
-
-    data = [(map_node.id, ngram[0], 1) for ngram in map_id_terms]
+        added_nd_ng += len(our_ls[list_type]['add_data'])
+    
+    # synonyms set unions
+    #
+    # original arcs (directed couples)
+    old_arcs = session.query(NodeNgramNgram.ngramx_id, NodeNgramNgram.ngramy_id).filter(NodeNgramNgram.node_id == group.id).all()
    
-    bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])
+    # TODO groupes: correspondance entre les IDS_source et les nouveaux IDS
    
+    # TODO groupes: factoriser le code de fusion de groupes
+    #               depuis rest_v1_0.ngrams.Group.get
+    #               ou la remplacer par une agrégation sql + sets
+    #               cf. long commentaire en bas
    
-    # bulk_insert(NodeNgramNgram, ['node_id', 'ngramx_id', 'ngramy_id', 'weight'], [d for d in data])
+    # INSERT INTO node_nodengramngram
+    # ===============================
    
+    print("INFO: added %i elements in the lists indices" % added_nd_ng)
+    print("INFO: added %i new ngrams in the lexicon" % added_ng)
    
-    # lecture des ngrams préexistants
-    # ------------------



-# Remarque quand on a un list_node li alors faire:
-#     li.node_node_ngram_collection 
-#  (donne tous les node_ngram)
-#  (plus rapide que lancer une nouvelle session.query)
-# 
-# TODO utiliser carrément :
+
+
+# à chronométrer:
 # [w.node_ngram for w in listnode.node_node_ngram_collection]

+
+
+
+
+
+##################################
+#    essais fusion de groupes
+##################################
+# # tentative pour refaire le code de Samuel (dans rest_v1_0.ngrams.Group.get)
+# # qui fait les cliques de synonymes, directement en sql
+# 
+# select ngramx_id as root, ngramy_id as kid 
+#  into temporary tempo_1 
+#  from node_nodengramngram 
+#  where node_id = 199 
+#  and ngramx_id != ngramy_id ;
+#  
+# --  root | kid  
+# -- ------+------
+# --  3447 | 3443
+# --  3456 | 3462
+# --  3455 | 3462
+# --  3455 | 3456
+# --  3441 | 3448
+# --  3452 | 3446
+# --  3452 | 3444
+# 
+# puis parcours récursif cf http://stackoverflow.com/questions/28758058/
+# 
+# with recursive mes_cliques as (
+#   select root as root_id, root, kid
+#   from tempo_1
+#   union all
+#   select p.root_id, c.root, c.kid
+#   from tempo_1 as c
+#     join mes_cliques p on p.kid = c.root
+# )
+# select root_id, array_agg(kid) as edges_in_group
+# from mes_cliques
+# group by root_id;
+# 
+# RESULTAT
+# -- root_id |  edges_in_group  
+# -- --------+------------------
+# --    3441 | {3448}
+# --    3456 | {3462}
+# --    3452 | {3446,3444}
+# --    3447 | {3443}
+# --    3455 | {3462,3456,3462}
+# 
+# 
+
+# # autre résultat plus direct avec agrégat simple
+# # -----------------------------------------------
+# select ngramx_id as root, array_agg(ngramy_id) as kids 
+#  from node_nodengramngram
+#  where node_id = 199
+#  and ngramx_id != ngramy_id
+# group by ngramx_id ;
+# 
+# --  root |    kids     
+# -- ------+-------------
+# --  3441 | {3448}
+# --  3452 | {3446,3444}
+# --  3455 | {3462,3456}
+# --  3447 | {3443}
+# --  3456 | {3462}