[FEAT] Adding some specific monograms to the maplist.

2b07ba58 · delanoe · 4f12f4bc · 2b07ba58 · 2b07ba58 · 2b07ba58
Commit 2b07ba58 authored Nov 16, 2015 by delanoe
Hide whitespace changes
Inline Side-by-side

Showing with 53 additions and 63 deletions

cooccurrences.py analysis/cooccurrences.py +1 -2

functions.py analysis/functions.py +31 -23

mapList.py ngram/mapList.py +21 -38

No files found.
--- a/analysis/cooccurrences.py
+++ b/analysis/cooccurrences.py
@@ -14,7 +14,7 @@ def do_cooc(corpus=None
         , field1='ngrams', field2='ngrams'
         , miam_id=None, stop_id=None, group_id=None
         , cvalue_id=None
-         , n_min=2, n_max=None
+         , n_min=1, n_max=None
         , start=None, end=None
         , limit=1000
         , isMonopartite=True
@@ -62,7 +62,6 @@ def do_cooc(corpus=None
    session.commit()
    # END
    session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id==node_cooc.id).delete()
    session.commit()

--- a/analysis/functions.py
+++ b/analysis/functions.py
@@ -59,17 +59,25 @@ def do_distance(cooc_id, field1=None, field2=None, isMonopartite=True):
    #xo = diag_null(x)
    #y = diag_null(y)
+    distance = 'conditional'
-    x = x / x.sum(axis=1)
+    if distance == 'conditional':
-    y = y / y.sum(axis=0)
+        x = x / x.sum(axis=1)
+        y = y / y.sum(axis=0)
-    xs = x.sum(axis=1) - x
+        xs = x.sum(axis=1) - x
-    ys = x.sum(axis=0) - x
+        ys = x.sum(axis=0) - x
-    # top inclus ou exclus
-    n = ( xs + ys) / (2 * (x.shape[0] - 1))
+        # top inclus ou exclus
-    # top generic or specific
+        n = ( xs + ys) / (2 * (x.shape[0] - 1))
-    m = ( xs - ys) / (2 * (x.shape[0] - 1))
+        # top generic or specific
+        m = ( xs - ys) / (2 * (x.shape[0] - 1))
+    elif distance == 'cosine':
+        xs = x / np.sqrt((x**2).sum(axis=1) * (x**2).sum(axis=0))
+        n = np.max(xs.sum(axis=1))
+        m = np.min(xs.sum(axis=1))
    n = n.sort(inplace=False)
    m = m.sort(inplace=False)
@@ -110,21 +118,21 @@ def do_distance(cooc_id, field1=None, field2=None, isMonopartite=True):
    def getWeight(item):
        return item[1]
+#    
-    node_degree = sorted(G.degree().items(), key=getWeight, reverse=True)
+#    node_degree = sorted(G.degree().items(), key=getWeight, reverse=True)
-    #print(node_degree)
+#    #print(node_degree)
-    nodes_too_connected = [n[0] for n in node_degree[0:(round(len(node_degree)/5))]]
+#    nodes_too_connected = [n[0] for n in node_degree[0:(round(len(node_degree)/5))]]
+#
-    for n in nodes_too_connected:
+#    for n in nodes_too_connected:
-        n_edges = list()
+#        n_edges = list()
-        for v in nx.neighbors(G,n):
+#        for v in nx.neighbors(G,n):
-            #print((n, v), G[n][v]['weight'], ":", (v,n), G[v][n]['weight'])
+#            #print((n, v), G[n][v]['weight'], ":", (v,n), G[v][n]['weight'])
-            n_edges.append(((n, v), G[n][v]['weight']))
+#            n_edges.append(((n, v), G[n][v]['weight']))
+#
-        n_edges_sorted = sorted(n_edges, key=getWeight, reverse=True)
+#        n_edges_sorted = sorted(n_edges, key=getWeight, reverse=True)
-        #G.remove_edges_from([ e[0] for e in n_edges_sorted[round(len(n_edges_sorted)/2):]])
+#        #G.remove_edges_from([ e[0] for e in n_edges_sorted[round(len(n_edges_sorted)/2):]])
-        #G.remove_edges_from([ e[0] for e in n_edges_sorted[(round(len(nx.neighbors(G,n))/3)):]])
+#        #G.remove_edges_from([ e[0] for e in n_edges_sorted[(round(len(nx.neighbors(G,n))/3)):]])
-        G.remove_edges_from([ e[0] for e in n_edges_sorted[10:]])
+#        G.remove_edges_from([ e[0] for e in n_edges_sorted[10:]])
    G.remove_nodes_from(nx.isolates(G))
    partition = best_partition(G.to_undirected())

--- a/ngram/mapList.py
+++ b/ngram/mapList.py
@@ -15,10 +15,15 @@ from sqlalchemy.orm import aliased
 from ngram.tools import insert_ngrams
 import csv
-def compute_mapList(corpus,limit=500):
+def compute_mapList(corpus,limit=500,n=1):
    '''
    According to Specificities and stoplist,
    '''
+    monograms_part = 0.005
+    monograms_limit = round(limit * monograms_part)
+    multigrams_limit = limit - monograms_limit
    dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)
    node_miam  = get_or_create_node(nodetype='MiamList', corpus=corpus)
@@ -33,18 +38,30 @@ def compute_mapList(corpus,limit=500):
    Spec=aliased(NodeNodeNgram)
-    top_ngrams = (session.query(Spec.ngram_id, Spec.score)
+    query = (session.query(Spec.ngram_id, Spec.score)
                .join(Miam, Spec.ngram_id == Miam.ngram_id)
+                .join(Ngram, Ngram.id == Spec.ngram_id)
                #.outerjoin(Group, Group.ngramy_id == Spec.ngram_id)
                #.outerjoin(Stop, Stop.ngram_id == Spec.ngram_id)
                .filter(Miam.node_id == node_miam.id)
                #.filter(Group.node_id == node_group.id)
                #.filter(Stop.node_id == node_stop.id)
                .filter(Spec.nodex_id == node_spec.id)
+            )
+    top_monograms = (query
+                .filter(Ngram.n == 1)
+                .order_by(desc(Spec.score))
+                .limit(monograms_limit)
+               )
+    top_multigrams = (query
+                .filter(Ngram.n >= 2)
                .order_by(desc(Spec.score))
-                .limit(limit)
+                .limit(multigrams_limit)
               )
    #print([t for t in top_ngrams])
    node_mapList = get_or_create_node(nodetype='MapList', corpus=corpus)
@@ -53,7 +70,7 @@ def compute_mapList(corpus,limit=500):
    data = zip(
        [node_mapList.id for i in range(1,limit)]
-        , [n[0] for n in top_ngrams]
+        , [n[0] for n in list(top_multigrams) + list(top_monograms)]
        , [1 for i in range(1,limit)]
    )
    #print([d for d in data])
@@ -100,37 +117,3 @@ def insert_miam(corpus, ngrams=None, path_file_csv=None):
 #compute_mapList(corpus)
 #insert_miam(corpus=corpus, path_file_csv="Thesaurus_tag.csv")
-#def getNgrams(corpus=None, limit_inf=600, limit_sup=3000):
-#    '''
-#    getNgrams :: Corpus -> [(Int, String)] -> [(Int, String)]
-#    For a corpus, gives list of highest Cvalue ngrams and highest TFIDF (global)
-#    ngrams that have to be grouped with
-#    '''
-#    #tfidf_node = get_or_create_node(nodetype='Tfidf (global)', corpus=corpus)
-#    cvalue_node = get_or_create_node(nodetype='Cvalue', corpus=corpus)
-#    spec_node = get_or_create_node(nodetype='Specificity', corpus=corpus)
-#
-#
-#    #tfidf_ngrams  = queryNodeNodeNgram(nodeMeasure_id=tfidf_node.id, corpus_id=corpus.id)
-#    cvalue_ngrams = queryNodeNodeNgram(nodeMeasure_id=cvalue_node.id, corpus_id=corpus.id, limit=limit_sup)
-#    spec_ngrams   = queryNodeNodeNgram(nodeMeasure_id=spec_node.id, corpus_id=corpus.id, limit=limit_inf)
-#
-#    #print([n for n in tfidf_ngrams])
-#
-#    def list2set(_list):
-#        _set = set()
-#        for n in _list:
-#            _set.add((n[0],n[1]))
-#        return(_set)
-#
-#    cvalue_set = set()
-#    spec_set = set()
-#
-#    cvalue_set = list2set(cvalue_ngrams)
-#    spec_set   = list2set(spec_ngrams)
-#
-#    cvalue_setDiff = cvalue_set.difference(spec_set)
-#
-#    return(spec_set,cvalue_setDiff)
-#