Commit 9ab3433d authored by Romain Loth

new args 'start' and 'end' in toolchain.ngram_coocs (+ fixing NodeHyperdata slightly)

parent 5d3417fc
...@@ -37,6 +37,7 @@ NODETYPES = [ ...@@ -37,6 +37,7 @@ NODETYPES = [
'TFIDF-GLOBAL', # 14 'TFIDF-GLOBAL', # 14
] ]
# TODO find somewhere else than constants.py for function
import datetime import datetime
import dateutil import dateutil
def convert_to_date(date): def convert_to_date(date):
...@@ -46,6 +47,7 @@ def convert_to_date(date): ...@@ -46,6 +47,7 @@ def convert_to_date(date):
return dateutil.parser.parse(date) return dateutil.parser.parse(date)
INDEXED_HYPERDATA = { INDEXED_HYPERDATA = {
# TODO use properties during toolchain.hyperdata_indexing (type, convert_to_db, convert_from_db)
'publication_date': 'publication_date':
{'id': 1, 'type': datetime.datetime, 'convert_to_db': convert_to_date, 'convert_from_db': datetime.datetime.fromtimestamp}, {'id': 1, 'type': datetime.datetime, 'convert_to_db': convert_to_date, 'convert_from_db': datetime.datetime.fromtimestamp},
'title': 'title':
......
...@@ -25,20 +25,20 @@ class HyperdataValueComparer(object): ...@@ -25,20 +25,20 @@ class HyperdataValueComparer(object):
class HyperdataKey(TypeDecorator): class HyperdataKey(TypeDecorator):
"""Define a new type of column to describe a Node's type. """Define a new type of column to describe a Hyperdata field's type.
Internally, this column type is implemented as an SQL integer. Internally, this column type is implemented as an SQL integer.
Values are detailed in `gargantext.constants.NODETYPES`. Values are detailed in `gargantext.constants.INDEXED_HYPERDATA`.
""" """
impl = Integer impl = Integer
def process_bind_param(self, keyname, dialect): def process_bind_param(self, keyname, dialect):
if keyname in INDEXED_HYPERDATA: if keyname in INDEXED_HYPERDATA:
return INDEXED_HYPERDATA[keyname] return INDEXED_HYPERDATA[keyname]['id']
raise ValueError('Hyperdata key "%s" was not found in `gargantext.constants.NODETYPES`' % keyname) raise ValueError('Hyperdata key "%s" was not found in `gargantext.constants.INDEXED_HYPERDATA`' % keyname)
def process_result_value(self, keyindex, dialect): def process_result_value(self, keyindex, dialect):
for keyname, key in INDEXED_HYPERDATA: for keyname, keysubhash in INDEXED_HYPERDATA.items():
if key['id'] == keyindex: if keysubhash['id'] == keyindex:
return keyname return keyname
raise ValueError('Hyperdata key with id=%d was not found in `gargantext.constants.NODETYPES`' % keyindex) raise ValueError('Hyperdata key with id=%d was not found in `gargantext.constants.INDEXED_HYPERDATA`' % keyindex)
class NodeHyperdata(Base): class NodeHyperdata(Base):
...@@ -85,6 +85,7 @@ class NodeHyperdata(Base): ...@@ -85,6 +85,7 @@ class NodeHyperdata(Base):
# value # value
self.value = value self.value = value
# FIXME
@property @property
def value(self): def value(self):
"""Pseudo-attribute used to extract the value in the right format. """Pseudo-attribute used to extract the value in the right format.
...@@ -123,6 +124,7 @@ def HyperdataValueComparer_overrider(key): ...@@ -123,6 +124,7 @@ def HyperdataValueComparer_overrider(key):
if isinstance(args[0], str): if isinstance(args[0], str):
return getattr(NodeHyperdata.value_str, key)(*args) return getattr(NodeHyperdata.value_str, key)(*args)
return comparator return comparator
# ??
for key in set(dir(NodeHyperdata.value_flt) + dir(NodeHyperdata.value_str)): for key in set(dir(NodeHyperdata.value_flt) + dir(NodeHyperdata.value_str)):
if key in ('__dict__', '__weakref__', '__repr__', '__str__') or 'attr' in key or 'class' in key or 'init' in key or 'new' in key: if key in ('__dict__', '__weakref__', '__repr__', '__str__') or 'attr' in key or 'class' in key or 'init' in key or 'new' in key:
continue continue
......
...@@ -78,6 +78,7 @@ class Parser: ...@@ -78,6 +78,7 @@ class Parser:
except: except:
pass pass
else: else:
print("WARNING: Date unknown at _Parser level, using now()")
hyperdata['publication_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") hyperdata['publication_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# ...then parse all the "date" fields, to parse it into separate elements # ...then parse all the "date" fields, to parse it into separate elements
......
from gargantext.util.db import bulk_insert from gargantext.util.db import bulk_insert
from gargantext.constants import INDEXED_HYPERDATA from gargantext.constants import INDEXED_HYPERDATA
from gargantext.models import NodeHyperdata from gargantext.models import NodeHyperdata
from datetime import datetime
def _nodes_hyperdata_generator(corpus): def _nodes_hyperdata_generator(corpus):
"""This method generates columns for insertions in `nodes_hyperdata`. """This method generates columns for insertions in `nodes_hyperdata`.
In case one of the values is a list, its items are iterated over and In case one of the values is a list, its items are iterated over and
yielded separately. yielded separately.
If its a string (eg date) it will be truncated to 255 chars
""" """
for document in corpus.children(typename='DOCUMENT'): for document in corpus.children(typename='DOCUMENT'):
for keyname, key in INDEXED_HYPERDATA.items(): for keyname, key in INDEXED_HYPERDATA.items():
...@@ -29,6 +30,16 @@ def _nodes_hyperdata_generator(corpus): ...@@ -29,6 +30,16 @@ def _nodes_hyperdata_generator(corpus):
None, None,
value[:255], value[:255],
) )
elif isinstance(value, (datetime, )):
yield (
document.id,
key['id'],
None,
# value_str
value.strftime("%Y-%m-%d %H:%M:%S"),
)
else:
print("WARNING: Couldn't insert an INDEXED_HYPERDATA value because of unknown type:", type(value))
def index_hyperdata(corpus): def index_hyperdata(corpus):
......
from gargantext.models import Node, NodeNgram, NodeNgramNgram from gargantext.models import Node, NodeNgram, NodeNgramNgram, \
NodeHyperdata
from gargantext.util.lists import WeightedMatrix from gargantext.util.lists import WeightedMatrix
from gargantext.util.db import session, aliased, func from gargantext.util.db import session, aliased, func
from gargantext.util.db_cache import cache from gargantext.util.db_cache import cache
from gargantext.constants import DEFAULT_COOC_THRESHOLD from gargantext.constants import DEFAULT_COOC_THRESHOLD
from datetime import datetime
def compute_coocs(corpus, def compute_coocs(corpus,
overwrite_id = None, overwrite_id = None,
threshold = DEFAULT_COOC_THRESHOLD, threshold = DEFAULT_COOC_THRESHOLD,
mainlist_id = None, mainlist_id = None,
stoplist_id = None, stoplist_id = None,
start = None,
end = None,
symmetry_filter = True): symmetry_filter = True):
""" """
Count how often some extracted terms appear Count how often some extracted terms appear
...@@ -40,6 +44,10 @@ def compute_coocs(corpus, ...@@ -40,6 +44,10 @@ def compute_coocs(corpus,
- mainlist_id: mainlist to constrain the input ngrams - mainlist_id: mainlist to constrain the input ngrams
- stoplist_id: stoplist for filtering input ngrams - stoplist_id: stoplist for filtering input ngrams
(normally unnecessary if a mainlist is provided) (normally unnecessary if a mainlist is provided)
- start, end: provide one or both temporal limits to filter on doc date
NB the expected type of parameter value is datetime.datetime
(string is also possible but format must follow
this convention: "2001-01-01" aka "%Y-%m-%d")
(deprecated parameters) (deprecated parameters)
- field1,2: allowed to count other things than ngrams (eg tags) but no use case at present - field1,2: allowed to count other things than ngrams (eg tags) but no use case at present
...@@ -68,7 +76,6 @@ def compute_coocs(corpus, ...@@ -68,7 +76,6 @@ def compute_coocs(corpus,
# - TODO cvalue_id: allow a metric as additional input filter # - TODO cvalue_id: allow a metric as additional input filter
# - TODO n_min, n_max : filter on Ngram.n (aka length of ngram) # - TODO n_min, n_max : filter on Ngram.n (aka length of ngram)
# - TODO start, end : filter on document date
# - TODO weighted: if False normal cooc to be saved as result # - TODO weighted: if False normal cooc to be saved as result
# if True weighted cooc (experimental) # if True weighted cooc (experimental)
...@@ -127,6 +134,42 @@ def compute_coocs(corpus, ...@@ -127,6 +134,42 @@ def compute_coocs(corpus,
.filter( ~ x2.ngram_id.in_(stop_subquery) ) .filter( ~ x2.ngram_id.in_(stop_subquery) )
) )
if start:
if isinstance(start, datetime):
start_str = start.strftime("%Y-%m-%d %H:%M:%S")
else:
start_str = str(start)
# doc_ids matching this limit
starttime_subquery = (session
.query(NodeHyperdata.node_id)
.filter(NodeHyperdata.key=="publication_date")
.filter(NodeHyperdata.value_str >= start_str)
.subquery()
)
# direct use of str comparison op because there is consistency b/w
# sql alpha sort and chrono sort *in this format %Y-%m-%d %H:%M:%S*
# the filtering by start limit
coocs_query = coocs_query.filter(x1.node_id.in_(starttime_subquery))
if end:
if isinstance(end, datetime):
end_str = end.strftime("%Y-%m-%d %H:%M:%S")
else:
end_str = str(end)
endtime_subquery = (session
.query(NodeHyperdata.node_id)
.filter(NodeHyperdata.key=="publication_date")
.filter(NodeHyperdata.value_str <= end_str)
.subquery()
)
# the filtering by end limit
coocs_query = coocs_query.filter(x1.node_id.in_(endtime_subquery))
if symmetry_filter: if symmetry_filter:
# 1 filtre tenant en compte de la symétrie # 1 filtre tenant en compte de la symétrie
# -> réduit le travail de moitié !! # -> réduit le travail de moitié !!
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment