[Clean]

fce9da2e · delanoe · 72b8bdc4 · 72b8bdc4 · 72b8bdc4 · fce9da2e
Commit fce9da2e authored Jul 26, 2016 by delanoe
Showing with 1 addition and 149 deletions

__init__.py gargantext/util/crawlers/__init__.py +0 -23

metric_specificity.py gargantext/util/toolchain/metric_specificity.py +0 -125

ngrams_extraction.py gargantext/util/toolchain/ngrams_extraction.py +1 -1

No files found.
--- a/gargantext/util/crawlers/__init__.py
+++ b/gargantext/util/crawlers/__init__.py
-import importlib
-from gargantext.constants import RESOURCETYPES
-from gargantext.settings import DEBUG
-#if DEBUG: print("Loading available Crawlers")
-
-base_parser = "gargantext.util.crawlers"
-for resource in RESOURCETYPES:
-    if resource["crawler"] is not None:
-        try:
-            name =resource["crawler"]
-            # the crawler name is <basename> + "Crawler"; its module is the lowercased basename
-            filename = name.replace("Crawler", "").lower()
-            module = base_parser+".%s" %(filename)
-            importlib.import_module(module,name, locals(), globals())
-            #if DEBUG: print("\t-", name)
-        except Exception as e:
-            print("Check constants.py RESOURCETYPES declaration %s \nCRAWLER %s is not available for %s" %(str(e), resource["crawler"], resource["name"]))
-
-#initial import
-#from .cern import CernCrawler
-#from .istex import ISTexCrawler
-#from .pubmed import PubmedCrawler
-
--- a/gargantext/util/toolchain/metric_specificity.py
+++ b/gargantext/util/toolchain/metric_specificity.py
-"""
-Computes a specificity metric from the ngram cooccurrence matrix.
- + SAVE => WeightedList => NodeNgram
-"""
-from gargantext.models        import Node, Ngram, NodeNgram, NodeNgramNgram
-from gargantext.util.db       import session, aliased, func, bulk_insert
-from gargantext.util.lists    import WeightedList
-from collections              import defaultdict
-from pandas                   import DataFrame
-import pandas as pd
-
-def compute_specificity(corpus, cooc_id=None, cooc_matrix=None, overwrite_id = None):
-    '''
-    Compute the specificity, simple calculus.
-
-    Parameters:
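-        - cooc_id: id of a cooccurrences node to use as base
-                   (required unless cooc_matrix is given)
-        - cooc_matrix: optional in-memory cooccurrence matrix, used when no cooc_id is given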
-        - overwrite_id: optional preexisting specificity node to overwrite
-    '''
-
-    matrix = defaultdict(lambda : defaultdict(float))
-
-    if cooc_id == None and cooc_matrix == None:
-        raise TypeError("compute_specificity: needs a cooc_id or cooc_matrix param")
-
-    elif cooc_id:
-        cooccurrences = (session.query(NodeNgramNgram)
-                        .filter(NodeNgramNgram.node_id==cooc_id)
-                        )
-        # no filtering: cooc already filtered on mainlist_id at creation
-        for cooccurrence in cooccurrences:
-            matrix[cooccurrence.ngram1_id][cooccurrence.ngram2_id] = cooccurrence.weight
-            matrix[cooccurrence.ngram2_id][cooccurrence.ngram1_id] = cooccurrence.weight
-
-    elif cooc_matrix:
-        # copy WeightedMatrix into local matrix structure
-        for (ngram1_id, ngram2_id) in cooc_matrix.items:
-            w = cooc_matrix.items[(ngram1_id, ngram2_id)]
-            matrix[ngram1_id][ngram2_id] = w
-
-    nb_ngrams = len(matrix)
-
-    print("SPECIFICITY: computing on %i ngrams" % nb_ngrams)
-
-    x = DataFrame(matrix).fillna(0)
-
-    # proba (x/y) (<= each row is divided by its total)
-    x = x / x.sum(axis=1)
-
-    # vectorisation
-    # d:Matrix => v: Vector (len = nb_ngrams)
-    # v = d.sum(axis=1) (- itself)
-    xs = x.sum(axis=1) - x
-    ys = x.sum(axis=0) - x
-
-
-    # top included or excluded
-    #n = ( xs + ys) / (2 * (x.shape[0] - 1))
-
-    # top generic or specific (asc is spec, desc is generic)
-    v = ( xs - ys) / ( 2 * (x.shape[0] - 1))
-
-    ## d ##
-    #######
-    #               Grenelle  biodiversité  kilomètres  site  élus  île
-    # Grenelle             0             0           4     0     0    0
-    # biodiversité         0             0           0     0     4    0
-    # kilomètres           4             0           0     0     4    0
-    # site                 0             0           0     0     4    6
-    # élus                 0             4           4     4     0    0
-    # île                  0             0           0     6     0    0
-
-
-    ## d.sum(axis=1) ##
-    ###################
-    # Grenelle         4
-    # biodiversité     4
-    # kilomètres       8
-    # site            10
-    # élus            12
-    # île              6
-
-    # temporary result
-    # -------------------
-    # for now we use the row sums as the specificity ranking
-    # (**same** order as with the pre-refactoring formula, but a simpler computation)
-    # TODO analyze the mathematical AND semantic consistency of this indicator
-    #v.sort_values(inplace=True)
-
-    # [ ('biodiversité' , 0.333 ),
-    #   ('Grenelle'     , 0.5   ),
-    #   ('île'          , 0.599 ),
-    #   ('kilomètres'   , 1.333 ),
-    #   ('site'         , 1.333 ),
-    #   ('élus'         , 1.899 ) ]
-
-    # ----------------
-    # specificity node
-    if overwrite_id:
-        # overwrite pre-existing id
-        the_id = overwrite_id
-        session.query(NodeNgram).filter(NodeNgram.node_id==the_id).delete()
-        session.commit()
-    else:
-        specnode = corpus.add_child(
-            typename  = "SPECIFICITY",
-            name = "Specif (in:%s)" % corpus.id
-        )
-        session.add(specnode)
-        session.commit()
-        the_id = specnode.id
-
-    # print(v)
-    pd.options.display.float_format = '${:,.2f}'.format
-    if not v.empty:
-        data = WeightedList(
-                zip(  v.index.tolist()
-                    , v.values.tolist()[0]
-                 )
-               )
-        data.save(the_id)
-    else:
-        print("WARNING: had no terms in COOCS => empty SPECIFICITY node")
-
-    return(the_id)
--- a/gargantext/util/toolchain/ngrams_extraction.py
+++ b/gargantext/util/toolchain/ngrams_extraction.py
@@ -48,7 +48,7 @@ def extract_ngrams(corpus, keys=('title', 'abstract', ), do_subngrams = DEFAULT_
        ngrams_data = set()
        # extract ngrams
        resource_type_index = corpus.resources()[0]['type']
-        documents_count = 0
+
        resource_type = RESOURCETYPES[resource_type_index]
        default_language_iso2 = resource_type['default_language']
        for documents_count, document in enumerate(corpus.children('DOCUMENT')):