Commit b33f37eb authored by delanoe's avatar delanoe

[FEAT] Generic cooccurrence function with miam_id, stop_id, group_id.

parent 38556c56
from env import *
from admin.utils import PrintException
from gargantext_web.db import NodeNgram
from gargantext_web.db import *
from parsing.corpustools import *
import sqlalchemy
from sqlalchemy.sql import func
from sqlalchemy import desc, asc, or_, and_, Date, cast, select
from sqlalchemy import literal_column
from sqlalchemy.orm import aliased
from sqlalchemy.sql import func
from gargantext_web.db import Node, NodeNgram, NodeNgramNgram, NodeNodeNgram, NodeHyperdata, Hyperdata
from gargantext_web.db import session, cache, get_or_create_node, bulk_insert
from analysis.lists import WeightedMatrix, UnweightedList, Translations
# from gargantext_web.db import Node, get_cursor
def cooc(corpus=None
        , miam_id=None, stop_id=None, group_id=None
        , start=None, end=None
        , limit=1000):
    '''
    Compute the cooccurrence matrix and save it, returning NodeNgramNgram.node_id.

    For the moment lists of parameters are not supported because lists need to
    be merged before.

    corpus   :: Corpus
    miam_id  :: Int, Node.id of the Miam (main) list to keep
    stop_id  :: Int, Node.id of the Stop list to remove
    group_id :: Int, Node.id of the Group list (synonym pairs) to merge

    For the moment, start and end are simple; only year is implemented yet.
    start :: TimeStamp -- example: '2010-05-30 02:00:00+02'
    end   :: TimeStamp
    limit :: Int, maximum number of (ngram_x, ngram_y) pairs kept
    '''
    # One Cooccurrence node per (corpus, miam list): created or reused.
    node_cooc = get_or_create_node(nodetype='Cooccurrence', corpus=corpus
        , name_str="Cooccurrences corpus " + str(corpus.id) + "list_id: " + str(miam_id)
        )

    # TODO : save parameters in Node
    # args, _, _, parameters = inspect.getargvalues(inspect.currentframe())
    # print(parameters)
    # for parameter in parameters.keys():
    #     print(parameters[parameter])
    #     node_cooc.hyperdata[parameter] = parameters[parameter]
    #
    # session.add(node_cooc)
    # session.commit()
    # print(node_cooc.hyperdata)

    # Recompute from scratch: drop any previous matrix stored on this node.
    session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id == node_cooc.id).delete()
    session.commit()

    # Self-join of NodeNgram on the same document gives cooccurring pairs.
    NodeNgramX = aliased(NodeNgram)
    NodeNgramY = aliased(NodeNgram)
    doc_id = cache.NodeType['Document'].id

    cooc_query = (session.query(NodeNgramX.ngram_id, NodeNgramY.ngram_id, func.count())
                  .join(Node, Node.id == NodeNgramX.node_id)
                  .join(NodeNgramY, NodeNgramY.node_id == Node.id)
                  )

    # Optional lower time bound on the documents' 'datetime' hyperdata.
    if start is not None:
        Start = aliased(NodeHyperdata)
        StartFormat = aliased(Hyperdata)
        cooc_query = (cooc_query.join(Start, Start.node_id == Node.id)
                      .join(StartFormat, StartFormat.id == Start.hyperdata_id)
                      .filter(StartFormat.name == 'datetime')
                      .filter(Start.value_datetime >= start)
                      )

    # Optional upper time bound, symmetric to `start`.
    if end is not None:
        End = aliased(NodeHyperdata)
        EndFormat = aliased(Hyperdata)
        cooc_query = (cooc_query.join(End, End.node_id == Node.id)
                      .join(EndFormat, EndFormat.id == End.hyperdata_id)
                      .filter(EndFormat.name == 'datetime')
                      .filter(End.value_datetime <= end)
                      )

    # Restrict to the corpus' documents; `x < y` keeps each unordered pair once.
    cooc_query = (cooc_query.filter(Node.parent_id == corpus.id, Node.type_id == doc_id)
                  .filter(NodeNgramX.ngram_id < NodeNgramY.ngram_id)
                  .group_by(NodeNgramX.ngram_id, NodeNgramY.ngram_id)
                  .order_by(func.count())
                  .limit(limit)
                  )

    matrix = WeightedMatrix(cooc_query)

    if miam_id is not None:
        # miam = get_or_create_node(nodetype='Cvalue', corpus=corpus)
        miam_list = UnweightedList(session.query(NodeNodeNgram.ngram_id)
                                   .filter(NodeNodeNgram.nodex_id == miam_id).all()
                                   )
    if stop_id is not None:
        # stop = get_or_create_node(nodetype='StopList', corpus=corpus)
        stop_list = UnweightedList(session.query(NodeNgram.ngram_id)
                                   .filter(NodeNgram.node_id == stop_id).all()
                                   )
    if group_id is not None:
        # group = get_or_create_node(nodetype='GroupList', corpus=corpus)
        # BUG FIX: the original filtered on stop_id here, loading the wrong node.
        group_list = UnweightedList(session.query(NodeNgramNgram.ngramx_id, NodeNgramNgram.ngramy_id)
                                    .filter(NodeNgramNgram.node_id == group_id).all()
                                    )

    # Combine the matrix with the lists that were actually requested.
    if miam_id is not None and stop_id is None and group_id is None:
        cooc = (matrix & miam_list)
    elif miam_id is not None and stop_id is not None and group_id is None:
        cooc = (matrix & miam_list) - stop_list
    elif miam_id is not None and stop_id is not None and group_id is not None:
        cooc = (matrix & miam_list & group_list) - stop_list
    else:
        # Robustness fix: the original left `cooc` unbound in this case
        # (UnboundLocalError); fall back to the raw matrix.
        cooc = matrix

    cooc.save(node_cooc.id)
    return(node_cooc.id)
......@@ -62,6 +62,7 @@ for model_name, model in models.__dict__.items():
NodeNgram = Node_Ngram
NodeResource = Node_Resource
NodeHyperdata = Node_Hyperdata
# manually declare the Node table...
from datetime import datetime
......
#from admin.env import *
import inspect
from admin.utils import PrintException,DebugTime
from django.db import connection, transaction
from sqlalchemy import desc, asc, or_, and_, Date, cast, select
from sqlalchemy import literal_column
from sqlalchemy.orm import aliased
from sqlalchemy.sql import func
from gargantext_web.db import Node, NodeNgram, NodeNgramNgram, NodeNodeNgram
from gargantext_web.db import session, cache, get_or_create_node, bulk_insert
from collections import defaultdict
import numpy as np
import pandas as pd
from analysis.lists import WeightedMatrix, UnweightedList
def cooc(corpus=None, list_id=None, limit=1000):
    '''
    Compute the cooccurrence matrix of a corpus, filtered by its Cvalue list
    and its Stop list, and save it; returns the Cooccurrence Node.id.

    corpus  :: Corpus
    list_id :: Int, id of the list used to label the Cooccurrence node
    limit   :: Int, maximum number of pairs kept
    '''
    node_cooc = get_or_create_node(nodetype='Cooccurrence', corpus=corpus
        , name_str="Cooccurrences corpus " + str(corpus.id) + "for list Cvalue" + str(list_id))

    # Recompute from scratch: drop any previous matrix stored on this node.
    session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id == node_cooc.id).delete()
    session.commit()

    # Self-join of NodeNgram on the same document gives cooccurring pairs.
    NodeNgramX = aliased(NodeNgram)
    NodeNgramY = aliased(NodeNgram)
    doc_id = cache.NodeType['Document'].id

    # literal_column(str(miam_id)).label("node_id"),
    query = (session.query(NodeNgramX.ngram_id, NodeNgramY.ngram_id, func.count())
             .join(Node, Node.id == NodeNgramX.node_id)
             .join(NodeNgramY, NodeNgramY.node_id == Node.id)
             .filter(Node.parent_id == corpus.id, Node.type_id == doc_id)
             .filter(NodeNgramX.ngram_id < NodeNgramY.ngram_id)
             .group_by(NodeNgramX.ngram_id, NodeNgramY.ngram_id)
             .order_by(func.count())
             .limit(limit)
             )

    # Keep only ngrams of the corpus' Cvalue list, minus its Stop list.
    cvalue_id = get_or_create_node(nodetype='Cvalue', corpus=corpus).id
    stop_id = get_or_create_node(nodetype='StopList', corpus=corpus).id

    cvalue_list = UnweightedList(session.query(NodeNodeNgram.ngram_id).filter(NodeNodeNgram.nodex_id == cvalue_id).all())
    stop_list = UnweightedList(session.query(NodeNgram.ngram_id).filter(NodeNgram.node_id == stop_id).all())

    matrix = WeightedMatrix(query)
    cooc = matrix & cvalue_list - stop_list
    cooc.save(node_cooc.id)
    return(node_cooc.id)
def coocOld(corpus=None, list_id=None, limit=100):
    '''
    cooc :: Corpus -> Int -> NodeNgramNgram

    Legacy raw-SQL implementation: inserts the cooccurrence pairs of the
    corpus' documents (restricted to the whitelist `list_id`) directly into
    node_nodengramngram, and returns the Cooccurrence Node.id.
    '''
    cursor = connection.cursor()

    node_cooc = get_or_create_node(nodetype='Cooccurrence', corpus=corpus
        , name_str="Cooccurrences corpus " + str(corpus.id) + "for list Cvalue" + str(list_id))

    # Recompute from scratch: drop any previous matrix stored on this node.
    session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id == node_cooc.id).delete()
    session.commit()

    # NOTE(review): values are interpolated with %-formatting, not bound
    # parameters; safe only because all of them are integer ids from the ORM.
    query_cooc = """
    INSERT INTO node_nodengramngram (node_id, "ngramx_id", "ngramy_id", score)
    SELECT
        %d as node_id,
        ngX.id,
        ngY.id,
        COUNT(*) AS score
    FROM
        node_node AS n  -- the nodes who are direct children of the corpus
    INNER JOIN
        node_node_ngram AS nngX ON nngX.node_id = n.id  -- list of ngrams contained in the node
    INNER JOIN
        node_nodenodengram AS whitelistX ON whitelistX.ngram_id = nngX.ngram_id  -- list of ngrams contained in the whitelist and in the node
    INNER JOIN
        node_ngram AS ngX ON ngX.id = whitelistX.ngram_id  -- ngrams which are in both
    INNER JOIN
        node_node_ngram AS nngY ON nngY.node_id = n.id
    INNER JOIN
        node_nodenodengram AS whitelistY ON whitelistY.ngram_id = nngY.ngram_id
    INNER JOIN
        node_ngram AS ngY ON ngY.id = whitelistY.ngram_id
    WHERE
        n.parent_id = %s
    AND
        whitelistX.nodex_id = %s
    AND
        whitelistY.nodex_id = %s
    AND
        nngX.ngram_id < nngY.ngram_id  -- so we only get distinct pairs of ngrams
    GROUP BY
        ngX.id,
        ngX.terms,
        ngY.id,
        ngY.terms
    ORDER BY
        score DESC
    LIMIT
        %d
    """ % (node_cooc.id, corpus.id, list_id, list_id, limit)

    # print(query_cooc)
    cursor.execute(query_cooc)
    return(node_cooc.id)
def specificity(cooc_id=None, corpus=None):
cooccurrences = session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==cooc_id).all()
matrix = defaultdict(lambda : defaultdict(float))
......@@ -149,20 +46,24 @@ def specificity(cooc_id=None, corpus=None):
bulk_insert(NodeNodeNgram, ['nodex_id', 'nodey_id', 'ngram_id', 'score'], [d for d in data])
return(node.id)
def compute_specificity(corpus, limit=100):
    '''
    Computing specificities as NodeNodeNgram.
    All workflow is the following:
    1) Compute the cooc matrix
    2) Compute the specificity score, saving it in database, return its Node
    '''
    dbg = DebugTime('Corpus #%d - specificity' % corpus.id)

    # The Cvalue list acts as the main (miam) list for the cooc matrix.
    list_cvalue = get_or_create_node(nodetype='Cvalue', corpus=corpus)
    cooc_id = cooc(corpus=corpus, miam_id=list_cvalue.id, limit=limit)
    specificity(cooc_id=cooc_id, corpus=corpus)

    dbg.show('specificity')
#corpus=session.query(Node).filter(Node.id==244250).first()
#cooc2(corpus)
#compute_specificity(corpus)
......@@ -13,8 +13,8 @@ def ngram_workflow(corpus):
'''
compute_tfidf(corpus)
compute_tfidf_global(corpus)
compute_cvalue(corpus,limit=1000) # size
compute_specificity(corpus,limit=800)
compute_cvalue(corpus,limit=3000) # size
compute_specificity(corpus,limit=200)
# compute_stop(corpus)
compute_groups(corpus,limit_inf=400, limit_sup=600)
# compute_miam(corpus,limit=100) # size
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment