Commit 9ab3433d authored by Romain Loth

new args 'start' and 'end' in toolchain.ngram_coocs (+ fixing NodeHyperdata slightly)

parent 5d3417fc
......@@ -37,6 +37,7 @@ NODETYPES = [
'TFIDF-GLOBAL', # 14
]
# TODO find somewhere else than constants.py for function
import datetime
import dateutil
def convert_to_date(date):
......@@ -46,6 +47,7 @@ def convert_to_date(date):
return dateutil.parser.parse(date)
INDEXED_HYPERDATA = {
# TODO use properties during toolchain.hyperdata_indexing (type, convert_to_db, convert_from_db)
'publication_date':
{'id': 1, 'type': datetime.datetime, 'convert_to_db': convert_to_date, 'convert_from_db': datetime.datetime.fromtimestamp},
'title':
......
......@@ -25,20 +25,20 @@ class HyperdataValueComparer(object):
class HyperdataKey(TypeDecorator):
"""Define a new type of column to describe a Node's type.
"""Define a new type of column to describe a Hyperdata field's type.
Internally, this column type is implemented as an SQL integer.
Values are detailed in `gargantext.constants.NODETYPES`.
Values are detailed in `gargantext.constants.INDEXED_HYPERDATA`.
"""
impl = Integer
def process_bind_param(self, keyname, dialect):
if keyname in INDEXED_HYPERDATA:
return INDEXED_HYPERDATA[keyname]
raise ValueError('Hyperdata key "%s" was not found in `gargantext.constants.NODETYPES`' % keyname)
return INDEXED_HYPERDATA[keyname]['id']
raise ValueError('Hyperdata key "%s" was not found in `gargantext.constants.INDEXED_HYPERDATA`' % keyname)
def process_result_value(self, keyindex, dialect):
for keyname, key in INDEXED_HYPERDATA:
if key['id'] == keyindex:
for keyname, keysubhash in INDEXED_HYPERDATA.items():
if keysubhash['id'] == keyindex:
return keyname
raise ValueError('Hyperdata key with id=%d was not found in `gargantext.constants.NODETYPES`' % keyindex)
raise ValueError('Hyperdata key with id=%d was not found in `gargantext.constants.INDEXED_HYPERDATA`' % keyindex)
class NodeHyperdata(Base):
......@@ -85,6 +85,7 @@ class NodeHyperdata(Base):
# value
self.value = value
# FIXME
@property
def value(self):
"""Pseudo-attribute used to extract the value in the right format.
......@@ -123,6 +124,7 @@ def HyperdataValueComparer_overrider(key):
if isinstance(args[0], str):
return getattr(NodeHyperdata.value_str, key)(*args)
return comparator
# ??
for key in set(dir(NodeHyperdata.value_flt) + dir(NodeHyperdata.value_str)):
if key in ('__dict__', '__weakref__', '__repr__', '__str__') or 'attr' in key or 'class' in key or 'init' in key or 'new' in key:
continue
......
......@@ -78,6 +78,7 @@ class Parser:
except:
pass
else:
print("WARNING: Date unknown at _Parser level, using now()")
hyperdata['publication_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# ...then parse all the "date" fields, to parse it into separate elements
......
from gargantext.util.db import bulk_insert
from gargantext.constants import INDEXED_HYPERDATA
from gargantext.models import NodeHyperdata
from datetime import datetime
def _nodes_hyperdata_generator(corpus):
"""This method generates columns for insertions in `nodes_hyperdata`.
In case one of the values is a list, its items are iterated over and
yielded separately.
If its a string (eg date) it will be truncated to 255 chars
"""
for document in corpus.children(typename='DOCUMENT'):
for keyname, key in INDEXED_HYPERDATA.items():
......@@ -29,6 +30,16 @@ def _nodes_hyperdata_generator(corpus):
None,
value[:255],
)
elif isinstance(value, (datetime, )):
yield (
document.id,
key['id'],
None,
# value_str
value.strftime("%Y-%m-%d %H:%M:%S"),
)
else:
print("WARNING: Couldn't insert an INDEXED_HYPERDATA value because of unknown type:", type(value))
def index_hyperdata(corpus):
......
from gargantext.models import Node, NodeNgram, NodeNgramNgram
from gargantext.models import Node, NodeNgram, NodeNgramNgram, \
NodeHyperdata
from gargantext.util.lists import WeightedMatrix
from gargantext.util.db import session, aliased, func
from gargantext.util.db_cache import cache
from gargantext.constants import DEFAULT_COOC_THRESHOLD
from datetime import datetime
def compute_coocs(corpus,
overwrite_id = None,
threshold = DEFAULT_COOC_THRESHOLD,
mainlist_id = None,
stoplist_id = None,
start = None,
end = None,
symmetry_filter = True):
"""
Count how often some extracted terms appear
......@@ -40,6 +44,10 @@ def compute_coocs(corpus,
- mainlist_id: mainlist to constrain the input ngrams
- stoplist_id: stoplist for filtering input ngrams
(normally unnecessary if a mainlist is provided)
- start, end: provide one or both temporal limits to filter on doc date
NB the expected type of parameter value is datetime.datetime
(string is also possible but format must follow
this convention: "2001-01-01" aka "%Y-%m-%d")
(deprecated parameters)
- field1,2: allowed to count other things than ngrams (eg tags) but no use case at present
......@@ -68,7 +76,6 @@ def compute_coocs(corpus,
# - TODO cvalue_id: allow a metric as additional input filter
# - TODO n_min, n_max : filter on Ngram.n (aka length of ngram)
# - TODO start, end : filter on document date
# - TODO weighted: if False normal cooc to be saved as result
# if True weighted cooc (experimental)
......@@ -127,6 +134,42 @@ def compute_coocs(corpus,
.filter( ~ x2.ngram_id.in_(stop_subquery) )
)
if start:
if isinstance(start, datetime):
start_str = start.strftime("%Y-%m-%d %H:%M:%S")
else:
start_str = str(start)
# doc_ids matching this limit
starttime_subquery = (session
.query(NodeHyperdata.node_id)
.filter(NodeHyperdata.key=="publication_date")
.filter(NodeHyperdata.value_str >= start_str)
.subquery()
)
# direct use of str comparison op because there is consistency b/w
# sql alpha sort and chrono sort *in this format %Y-%m-%d %H:%M:%S*
# the filtering by start limit
coocs_query = coocs_query.filter(x1.node_id.in_(starttime_subquery))
if end:
if isinstance(end, datetime):
end_str = end.strftime("%Y-%m-%d %H:%M:%S")
else:
end_str = str(end)
endtime_subquery = (session
.query(NodeHyperdata.node_id)
.filter(NodeHyperdata.key=="publication_date")
.filter(NodeHyperdata.value_str <= end_str)
.subquery()
)
# the filtering by end limit
coocs_query = coocs_query.filter(x1.node_id.in_(endtime_subquery))
if symmetry_filter:
# 1 filtre tenant en compte de la symétrie
# -> réduit le travail de moitié !!
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment