new args 'start' and 'end' in toolchain.ngram_coocs (+ fixing NodeHyperdata slightly)

9ab3433d · Romain Loth · 5d3417fc · 9ab3433d · 9ab3433d · 9ab3433d
Commit 9ab3433d authored Mar 22, 2016 by Romain Loth
5 changed files
--- a/gargantext/constants.py
+++ b/gargantext/constants.py
@@ -37,6 +37,7 @@ NODETYPES = [
    'TFIDF-GLOBAL',          # 14
 ]
+# TODO find somewhere other than constants.py for this function
 import datetime
 import dateutil
 def convert_to_date(date):
@@ -46,6 +47,7 @@ def convert_to_date(date):
        return dateutil.parser.parse(date)
 INDEXED_HYPERDATA = {
+    # TODO use properties during toolchain.hyperdata_indexing (type, convert_to_db, convert_from_db)
    'publication_date':
        {'id': 1, 'type': datetime.datetime, 'convert_to_db': convert_to_date, 'convert_from_db': datetime.datetime.fromtimestamp},
    'title':

--- a/gargantext/models/hyperdata.py
+++ b/gargantext/models/hyperdata.py
@@ -25,20 +25,20 @@ class HyperdataValueComparer(object):
 class HyperdataKey(TypeDecorator):
-    """Define a new type of column to describe a Node's type.
+    """Define a new type of column to describe a Hyperdata field's type.
    Internally, this column type is implemented as an SQL integer.
-    Values are detailed in `gargantext.constants.NODETYPES`.
+    Values are detailed in `gargantext.constants.INDEXED_HYPERDATA`.
    """
    impl = Integer
    def process_bind_param(self, keyname, dialect):
        if keyname in INDEXED_HYPERDATA:
-            return INDEXED_HYPERDATA[keyname]
+            return INDEXED_HYPERDATA[keyname]['id']
-        raise ValueError('Hyperdata key "%s" was not found in `gargantext.constants.NODETYPES`' % keyname)
+        raise ValueError('Hyperdata key "%s" was not found in `gargantext.constants.INDEXED_HYPERDATA`' % keyname)
    def process_result_value(self, keyindex, dialect):
-        for keyname, key in INDEXED_HYPERDATA:
+        for keyname, keysubhash in INDEXED_HYPERDATA.items():
-            if key['id'] == keyindex:
+            if keysubhash['id'] == keyindex:
                return keyname
-        raise ValueError('Hyperdata key with id=%d was not found in `gargantext.constants.NODETYPES`' % keyindex)
+        raise ValueError('Hyperdata key with id=%d was not found in `gargantext.constants.INDEXED_HYPERDATA`' % keyindex)
 class NodeHyperdata(Base):
@@ -85,6 +85,7 @@ class NodeHyperdata(Base):
        # value
        self.value = value
+    # FIXME
    @property
    def value(self):
        """Pseudo-attribute used to extract the value in the right format.
@@ -123,6 +124,7 @@ def HyperdataValueComparer_overrider(key):
        if isinstance(args[0], str):
            return getattr(NodeHyperdata.value_str, key)(*args)
    return comparator
+# ??
 for key in set(dir(NodeHyperdata.value_flt) + dir(NodeHyperdata.value_str)):
    if key in ('__dict__', '__weakref__', '__repr__', '__str__') or 'attr' in key or 'class' in key or 'init' in key or 'new' in key:
        continue

--- a/gargantext/util/parsers/_Parser.py
+++ b/gargantext/util/parsers/_Parser.py
@@ -78,6 +78,7 @@ class Parser:
                except:
                    pass
        else:
+            print("WARNING: Date unknown at _Parser level, using now()")
            hyperdata['publication_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # ...then parse all the "date" fields, to parse it into separate elements

--- a/gargantext/util/toolchain/hyperdata_indexing.py
+++ b/gargantext/util/toolchain/hyperdata_indexing.py
 from gargantext.util.db import bulk_insert
 from gargantext.constants import INDEXED_HYPERDATA
 from gargantext.models import NodeHyperdata
+from datetime          import datetime
 def _nodes_hyperdata_generator(corpus):
    """This method generates columns for insertions in `nodes_hyperdata`.
    In case one of the values is a list, its items are iterated over and
    yielded separately.
+    If it's a string (e.g. a date) it will be truncated to 255 chars
    """
    for document in corpus.children(typename='DOCUMENT'):
        for keyname, key in INDEXED_HYPERDATA.items():
@@ -29,6 +30,16 @@ def _nodes_hyperdata_generator(corpus):
                            None,
                            value[:255],
                        )
+                    elif isinstance(value, (datetime, )):
+                        yield (
+                            document.id,
+                            key['id'],
+                            None,
+                            # value_str
+                            value.strftime("%Y-%m-%d %H:%M:%S"),
+                        )
+                    else:
+                        print("WARNING: Couldn't insert an INDEXED_HYPERDATA value because of unknown type:", type(value))
 def index_hyperdata(corpus):

--- a/gargantext/util/toolchain/ngram_coocs.py
+++ b/gargantext/util/toolchain/ngram_coocs.py
-from gargantext.models         import Node, NodeNgram, NodeNgramNgram
+from gargantext.models         import Node, NodeNgram, NodeNgramNgram, \
+                                      NodeHyperdata
 from gargantext.util.lists     import WeightedMatrix
 from gargantext.util.db        import session, aliased, func
 from gargantext.util.db_cache  import cache
 from gargantext.constants      import DEFAULT_COOC_THRESHOLD
+from datetime                  import datetime
 def compute_coocs(corpus,
                    overwrite_id  = None,
                    threshold     = DEFAULT_COOC_THRESHOLD,
                    mainlist_id     = None,
                    stoplist_id     = None,
+                    start           = None,
+                    end             = None,
                    symmetry_filter = True):
    """
    Count how often some extracted terms appear
@@ -40,6 +44,10 @@ def compute_coocs(corpus,
      - mainlist_id: mainlist to constrain the input ngrams
      - stoplist_id: stoplist for filtering input ngrams
                     (normally unnecessary if a mainlist is provided)
+      - start, end: provide one or both temporal limits to filter on doc date
+                    NB the expected type of parameter value is datetime.datetime
+                        (a string is also possible but must look like "2001-01-01",
+                          i.e. "%Y-%m-%d"; it is compared as text to stored dates)
     (deprecated parameters)
      - field1,2: allowed to count other things than ngrams (eg tags) but no use case at present
@@ -68,7 +76,6 @@ def compute_coocs(corpus,
        #   - TODO cvalue_id: allow a metric as additional  input filter
        #   - TODO n_min, n_max : filter on Ngram.n (aka length of ngram)
-        #   - TODO start, end : filter on document date
        #   - TODO weighted: if False normal cooc to be saved as result
        #                    if True  weighted cooc (experimental)
@@ -127,6 +134,42 @@ def compute_coocs(corpus,
            .filter( ~ x2.ngram_id.in_(stop_subquery) )
        )
+    if start:
+        if isinstance(start, datetime):
+            start_str = start.strftime("%Y-%m-%d %H:%M:%S")
+        else:
+            start_str = str(start)
+        # doc_ids matching this limit
+        starttime_subquery = (session
+                                .query(NodeHyperdata.node_id)
+                                .filter(NodeHyperdata.key=="publication_date")
+                                .filter(NodeHyperdata.value_str >= start_str)
+                                .subquery()
+                           )
+        # direct use of the string comparison operator because SQL alphabetical
+        # order matches chronological order *in this format %Y-%m-%d %H:%M:%S*
+        # the filtering by start limit
+        coocs_query = coocs_query.filter(x1.node_id.in_(starttime_subquery))
+    if end:
+        if isinstance(end, datetime):
+            end_str = end.strftime("%Y-%m-%d %H:%M:%S")
+        else:
+            end_str = str(end)
+        endtime_subquery = (session
+                                .query(NodeHyperdata.node_id)
+                                .filter(NodeHyperdata.key=="publication_date")
+                                .filter(NodeHyperdata.value_str <= end_str)
+                                .subquery()
+                           )
+        # the filtering by end limit
+        coocs_query = coocs_query.filter(x1.node_id.in_(endtime_subquery))
    if symmetry_filter:
        # 1 filtre tenant en compte de la symétrie
        #  -> réduit le travail de moitié !!