Commit ab2c76dd authored by Romain Loth's avatar Romain Loth

Merge branch 'refactoring' into refactoring-rom

Conflicts:
	gargantext/views/api/urls.py
parents 63ec1b5c 8be7e5a7
......@@ -4,7 +4,8 @@ from annotations import views
# /!\ urls patterns here are *without* the trailing slash
urlpatterns = patterns('',
urlpatterns = [
# json:title,id,authors,journal,
# publication_date
# abstract_text,full_text
......@@ -16,4 +17,4 @@ urlpatterns = patterns('',
# url(r'^lists/(?P<list_id>[0-9]+)/ngrams/(?P<ngram_ids>[0-9,\+]+)+$', views.NgramEdit.as_view()),
# POST (fixed 2015-12-16)
# url(r'^lists/(?P<list_id>[0-9]+)/ngrams/create$', views.NgramCreate.as_view()), #
)
]
# django.ini file
[uwsgi]
# uwsgi --vacuum --socket monsite/mysite.sock --wsgi-file monsite/wsgi.py --chmod-socket=666 --home=/srv/alexandre.delanoe/env --chdir=/var/www/www/alexandre/monsite --env
env = DJANGO_SETTINGS_MODULE=gargantext.settings
#module = django.core.handlers.wsgi:WSGIHandler()
plugins = python35
# the base directory
chdir = /srv/gargantext
# Django's wsgi file
#module = wsgi
wsgi-file = /srv/gargantext/gargantext/wsgi.py
# the virtualenv
home = /srv/gargantext_env_3.5
lazy-apps = True
# master
master = true
# maximum number of processes
processes = 10
# the socket (use the full path to be safe)
socket = /tmp/gargantext.sock
threads = 4
# with appropriate permissions - *may* be needed
chmod-socket = 666
# clear environment on exit
vacuum = true
pidfile = /tmp/gargantext.pid
# touch /tmp/gargantext.reload to reload configuration (after git pull for instance)
touch-reload = /tmp/gargantext.reload
# respawn processes taking more than 20 seconds
harakiri = 120
# limit the project to 128 MB
#limit-as = 128
# respawn processes after serving 5000 requests
max-requests = 5000
# background the process & log
#daemonize = /var/log/uwsgi/gargantext.log
uid = 1000
gid = 1000
################### other gargantext constants ###################
[scrappers]
# default number of docs POSTed to scrappers.views.py
# (at page project > add a corpus > scan/process sample)
QUERY_SIZE_N_DEFAULT = 1000
# checked just before scraping to prevent running impossible workflows
# even if somebody sets "query size N" manually in the POST data
QUERY_SIZE_N_MAX = 20000
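The guard itself is not shown in this diff; a minimal sketch of what it could look like on the scrappers side (function and argument names here are hypothetical, not actual gargantext code; the default and maximum mirror QUERY_SIZE_N_DEFAULT and QUERY_SIZE_N_MAX above):

def clamp_query_size(posted_n, default=1000, maximum=20000):
    # fall back to the default, then refuse anything above the hard maximum
    n = int(posted_n) if posted_n else default
    return min(n, maximum)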
......@@ -37,14 +37,86 @@ NODETYPES = [
'TFIDF-GLOBAL', # 14
]
# TODO find somewhere else than constants.py for function
import datetime
import dateutil
def convert_to_date(date):
if isinstance(date, (int, float)):
return datetime.datetime.fromtimestamp(date)
else:
return dateutil.parser.parse(date)
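# Quick illustration of the two branches above (example values only, assuming the
# input is either a POSIX timestamp or a date string parseable by dateutil):
#   convert_to_date(1451606400)    -> datetime for 2016-01-01 (local timezone)
#   convert_to_date("2016-01-01")  -> datetime.datetime(2016, 1, 1, 0, 0)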
INDEXED_HYPERDATA = {
# TODO use properties during toolchain.hyperdata_indexing
# (type, convert_to_db, convert_from_db)
'count':
{ 'id' : 1
, 'type' : int
, 'convert_to_db' : int
, 'convert_from_db': int
},
'publication_date':
{ 'id' : 2
, 'type' : datetime.datetime
, 'convert_to_db' : convert_to_date
, 'convert_from_db': datetime.datetime.fromtimestamp
},
'title':
{ 'id' : 3
, 'type' : str
, 'convert_to_db' : str
, 'convert_from_db': str
},
'authors':
{ 'id' : 4
, 'type' : str
, 'convert_to_db' : str
, 'convert_from_db': str
},
'journal':
{ 'id' : 5
, 'type' : str
, 'convert_to_db' : str
, 'convert_from_db': str
},
'abstract':
{ 'id' : 6
, 'type' : str
, 'convert_to_db' : str
, 'convert_from_db': str
},
'text':
{ 'id' : 7
, 'type' : str
, 'convert_to_db' : str
, 'convert_from_db': str
},
'page':
{ 'id' : 8
, 'type' : int
, 'convert_to_db' : int
, 'convert_from_db': int
},
}
from gargantext.util.taggers import *
LANGUAGES = {
'en': {
'tagger': TurboTagger,
# 'tagger': EnglishMeltTagger,
# 'tagger': NltkTagger,
'tagger': EnglishMeltTagger,
#'tagger': TurboTagger,
#'tagger': NltkTagger,
},
'fr': {
'tagger': FrenchMeltTagger,
......
from .nodes import *
from .hyperdata import *
from .users import *
from .ngrams import *
from gargantext.util.db import *
from gargantext.constants import INDEXED_HYPERDATA
from .nodes import Node
import datetime
__all__ = ['NodeHyperdata']
class classproperty(object):
"""See: http://stackoverflow.com/a/3203659/734335
"""
def __init__(self, getter):
self.getter = getter
def __get__(self, instance, owner):
return self.getter(owner)
class HyperdataValueComparer(object):
"""This class is there to allow hyperdata comparison.
Its attributes are overridden at the end of this module to match those
of the `value_flt` and `value_str` attributes of the `NodeHyperdata` class.
"""
class HyperdataKey(TypeDecorator):
"""Define a new type of column to describe a Hyperdata field's type.
Internally, this column type is implemented as an SQL integer.
Values are detailed in `gargantext.constants.INDEXED_HYPERDATA`.
"""
impl = Integer
def process_bind_param(self, keyname, dialect):
if keyname in INDEXED_HYPERDATA:
return INDEXED_HYPERDATA[keyname]['id']
raise ValueError('Hyperdata key "%s" was not found in `gargantext.constants.INDEXED_HYPERDATA`' % keyname)
def process_result_value(self, keyindex, dialect):
for keyname, keysubhash in INDEXED_HYPERDATA.items():
if keysubhash['id'] == keyindex:
return keyname
raise ValueError('Hyperdata key with id=%d was not found in `gargantext.constants.INDEXED_HYPERDATA`' % keyindex)
class NodeHyperdata(Base):
"""This model's primary role is to allow better indexation of hyperdata.
It stores the values contained in the `nodes.hyperdata` column (only those
listed in `gargantext.constants.INDEXED_HYPERDATA`), each value being stored
together with the index of its key.
Example:
query = (session
.query(Node)
.join(NodeHyperdata)
.filter(NodeHyperdata.key == 'title')
.filter(NodeHyperdata.value.startswith('Bees'))
)
Example:
query = (session
.query(Node)
.join(NodeHyperdata)
.filter(NodeHyperdata.key == 'publication_date')
.filter(NodeHyperdata.value > datetime.datetime.now())
)
"""
__tablename__ = 'nodes_hyperdata'
id = Column( Integer, primary_key=True )
node_id = Column( Integer, ForeignKey(Node.id, ondelete='CASCADE'))
key = Column( HyperdataKey )
value_int = Column( Integer , index=True )
value_flt = Column( Double() , index=True )
value_utc = Column( DateTime(timezone=True) , index=True )
value_str = Column( String(255) , index=True )
value_txt = Column( Text , index=True )
def __init__(self, node=None, key=None, value=None):
"""Custom constructor
"""
# node reference
if node is not None:
if hasattr(node, 'id'):
self.node_id = node.id
else:
self.node_id = node
# key
if key is not None:
self.key = key
# value
self.value = value
# FIXME
@property
def value(self):
"""Pseudo-attribute used to extract the value in the right format.
"""
key = INDEXED_HYPERDATA[self.key]
return key['convert_from_db'](
self.value_flt if (self.value_str is None) else self.value_str
)
@value.setter
def value(self, value):
"""Pseudo-attribute used to insert the value in the right format.
"""
key = INDEXED_HYPERDATA[self.key]
value = key['convert_to_db'](value)
if isinstance(value, str):
self.value_str = value
else:
self.value_flt = value
@classproperty
def value(cls):
"""Pseudo-attribute used for hyperdata comparison inside a query.
"""
return HyperdataValueComparer()
def HyperdataValueComparer_overrider(key):
def comparator(self, *args):
if len(args) == 0:
return
if isinstance(args[0], datetime.datetime):
args = tuple(map(datetime.datetime.timestamp, args))
if isinstance(args[0], (int, float)):
return getattr(NodeHyperdata.value_flt, key)(*args)
if isinstance(args[0], str):
return getattr(NodeHyperdata.value_str, key)(*args)
return comparator
# Override HyperdataValueComparer's methods so that comparisons on the `value`
# pseudo-attribute dispatch to the `value_flt` or `value_str` column operators
for key in set(dir(NodeHyperdata.value_flt) + dir(NodeHyperdata.value_str)):
if key in ( '__dict__'
, '__weakref__'
, '__repr__'
, '__str__') \
or 'attr' in key \
or 'class' in key \
or 'init' in key \
or 'new' in key :
continue
setattr(HyperdataValueComparer, key, HyperdataValueComparer_overrider(key))
"""URL Configuration of GarganText
Views are shared between three main modules:
Views are shared between these modules:
- `api`, for JSON and CSV interaction with data
- `pages`, to present HTML views to the user
- `contents`, for Python-generated contents
- `annotations`, to annotate local context of a corpus (as global context)
- `graph explorer`, to explore graphs
"""
from django.conf.urls import include, url
......@@ -14,10 +16,15 @@ import gargantext.views.api.urls
import gargantext.views.generated.urls
import gargantext.views.pages.urls
# tempo: unchanged doc-annotations --
# Module Annotation
## tempo: unchanged doc-annotations --
from annotations import urls as annotations_urls
from annotations.views import main as annotations_main_view
# Module "Graph Explorer"
#from graphExplorer import urls as graphExplorer_urls
from graphExplorer.rest import Graph
from graphExplorer.views import explorer
urlpatterns = [
url(r'^admin/', admin.site.urls),
......@@ -25,7 +32,16 @@ urlpatterns = [
url(r'^api/', include(gargantext.views.api.urls)),
url(r'^', include(gargantext.views.pages.urls)),
# Module Annotation
# tempo: unchanged doc-annotations routes --
url(r'^annotations/', include(annotations_urls)),
url(r'^projects/(\d+)/corpora/(\d+)/documents/(\d+)/$', annotations_main_view),
url(r'^annotations/', include(annotations_urls))
# Module "Graph Explorer"
url(r'^projects/(\d+)/corpora/(\d+)/explorer$', explorer),
url(r'^projects/(\d+)/corpora/(\d+)/graph$', Graph.as_view()),
# to be removed:
url(r'^projects/(\d+)/corpora/(\d+)/node_link.json$', Graph.as_view())
#url(r'^projects/(\d+)/corpora/(\d+)/explorer$', include(graphExplorer.urls))
#url(r'^projects/(\d+)/corpora/(\d+)/explorer$', include(graphExplorer_urls))
]
from gargantext import settings
from gargantext.util.json import json_dumps
# get engine, session, etc.
from sqlalchemy.orm import sessionmaker, scoped_session
from sqlalchemy.ext.declarative import declarative_base
from gargantext.util.json import json_dumps
from sqlalchemy import delete
def get_engine():
from sqlalchemy import create_engine
......@@ -28,9 +28,9 @@ session = scoped_session(sessionmaker(bind=engine))
from sqlalchemy.types import *
from sqlalchemy.schema import Column, ForeignKey, UniqueConstraint
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.dialects.postgresql import JSONB, DOUBLE_PRECISION
from sqlalchemy.ext.mutable import MutableDict, MutableList
Double = DOUBLE_PRECISION
# useful for queries
......@@ -76,7 +76,7 @@ class bulk_insert:
try:
return '\t'.join(
value.replace('\\', '\\\\').replace('\n', '\\\n').replace('\r', '\\\r').replace('\t', '\\\t')
if isinstance(value, str) else str(value)
if isinstance(value, str) else str(value) if value is not None else '\\N'
for value in next(self.iter)
) + '\n'
except StopIteration:
......
......@@ -78,6 +78,7 @@ class Parser:
except:
pass
else:
print("WARNING: Date unknown at _Parser level, using now()")
hyperdata['publication_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# ...then parse all the "date" fields, to parse it into separate elements
......
......@@ -37,7 +37,8 @@ def scheduled_celery(func):
"""Provides a decorator to schedule a task with Celery.
"""
def go(*args, **kwargs):
shared_task(func).apply_async(args=args, kwargs=kwargs)
func.apply_async(args=args, kwargs=kwargs)
#shared_task(func).apply_async(args=args, kwargs=kwargs)
return go
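A minimal usage sketch, assuming `scheduled` resolves to this decorator and the wrapped function is already a Celery `@shared_task` (as `parse_extract_indexhyperdata` is further down in this diff):

# hedged illustration of the calling convention used in pages/projects.py
from gargantext.util.toolchain import parse_extract_indexhyperdata
scheduled(parse_extract_indexhyperdata)(corpus.id)   # corpus: a CORPUS Node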
......
from .parsing import parse
from .ngrams_extraction import extract_ngrams
from .hyperdata_indexing import index_hyperdata
# in usual run order
from .list_stop import do_stoplist
......@@ -14,7 +15,9 @@ from gargantext.util.db import session
from gargantext.models import Node
from datetime import datetime
from celery import shared_task
@shared_task
def parse_extract(corpus):
# retrieve corpus from database from id
if isinstance(corpus, int):
......@@ -36,6 +39,24 @@ def parse_extract(corpus):
extract_ngrams(corpus)
print('CORPUS #%d: extracted ngrams' % (corpus.id))
@shared_task
def parse_extract_indexhyperdata(corpus):
# retrieve corpus from database from id
if isinstance(corpus, int):
corpus_id = corpus
corpus = session.query(Node).filter(Node.id == corpus_id).first()
if corpus is None:
print('NO SUCH CORPUS: #%d' % corpus_id)
return
# apply actions
print('CORPUS #%d' % (corpus.id))
parse(corpus)
print('CORPUS #%d: parsed' % (corpus.id))
extract_ngrams(corpus)
print('CORPUS #%d: extracted ngrams' % (corpus.id))
index_hyperdata(corpus)
print('CORPUS #%d: indexed hyperdata' % (corpus.id))
# -------------------------------
# temporary ngram lists workflow
# -------------------------------
......
from gargantext.util.db import bulk_insert
from gargantext.constants import INDEXED_HYPERDATA
from gargantext.models import NodeHyperdata
from datetime import datetime
def _nodes_hyperdata_generator(corpus):
"""This method generates columns for insertions in `nodes_hyperdata`.
In case one of the values is a list, its items are iterated over and
yielded separately.
If it's a string (e.g. a date) it will be truncated to 255 chars
"""
for document in corpus.children(typename='DOCUMENT'):
for keyname, key in INDEXED_HYPERDATA.items():
if keyname in document.hyperdata:
values = key['convert_to_db'](document.hyperdata[keyname])
if not isinstance(values, list):
values = [values]
for value in values:
if isinstance(value, (int, )):
yield (
document.id,
key['id'],
value,
None,
None,
None,
None,
)
elif isinstance(value, (float, )):
yield (
document.id,
key['id'],
None,
value,
None,
None,
None,
)
elif isinstance(value, (datetime, )):
yield (
document.id,
key['id'],
None,
None,
value.strftime("%Y-%m-%d %H:%M:%S"),
# FIXME check timestamp +%Z
None,
None,
)
elif isinstance(value, (str, )) :
if len(value) < 255 :
yield (
document.id,
key['id'],
None,
None,
None,
value,
None,
)
elif len(value) < 2712 :
yield (
document.id,
key['id'],
None,
None,
None,
None,
value,
)
else :
print("The index row size exceeds \
the maximum of 2712 for index \
« ix_nodes_hyperdata_value_txt ». HINT: \
values larger than one third of a \
buffer page cannot be \
indexed (on postgres 9.5). TODO: \
use an index on the MD5 hash of the \
value and/or switch to \
full-text search indexing.")
yield (
document.id,
key['id'],
None,
None,
None,
None,
value[:2712],
)
else:
print("WARNING: Couldn't insert an INDEXED_HYPERDATA value because of unknown type:", type(value))
def index_hyperdata(corpus):
bulk_insert(
table = NodeHyperdata,
fields = ( 'node_id', 'key'
, 'value_int'
, 'value_flt'
, 'value_utc'
, 'value_str'
, 'value_txt' ),
data = _nodes_hyperdata_generator(corpus),
)
from gargantext.models import Node, NodeNgram, NodeNgramNgram
from gargantext.models import Node, NodeNgram, NodeNgramNgram, \
NodeHyperdata
from gargantext.util.lists import WeightedMatrix
from gargantext.util.db import session, aliased, func
from gargantext.util.db_cache import cache
from gargantext.constants import DEFAULT_COOC_THRESHOLD
from datetime import datetime
def compute_coocs(corpus,
def compute_coocs( corpus,
overwrite_id = None,
threshold = DEFAULT_COOC_THRESHOLD,
mainlist_id = None,
stoplist_id = None,
start = None,
end = None,
symmetry_filter = True):
"""
Count how often some extracted terms appear
......@@ -19,10 +23,10 @@ def compute_coocs(corpus,
node_id | ngram_id | weight ngram1_id | ngram2_id | score |
--------+----------+-------- ----------+-----------+-------+
MYDOCA | 487 | 1 => 487 | 294 | 2 |
MYDOCA | 294 | 3
MYDOCB | 487 | 1
MYDOCB | 294 | 4
MyDocA | 487 | 1 => 487 | 294 | 2 |
MyDocA | 294 | 3
MyDocB | 487 | 1
MyDocB | 294 | 4
Fill that info in DB:
- a *new* COOCCURRENCES node
......@@ -40,6 +44,10 @@ def compute_coocs(corpus,
- mainlist_id: mainlist to constrain the input ngrams
- stoplist_id: stoplist for filtering input ngrams
(normally unnecessary if a mainlist is provided)
- start, end: provide one or both temporal limits to filter on doc date
NB the expected type of parameter value is datetime.datetime
(string is also possible but format must follow
this convention: "2001-01-01" aka "%Y-%m-%d")
(deprecated parameters)
- field1,2: allowed to count other things than ngrams (eg tags) but no use case at present
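A hedged call sketch illustrating the date convention described above (`corpus` is assumed to be a CORPUS node already loaded through the session; the threshold is only an example):

from datetime import datetime
compute_coocs(corpus,
              threshold = 2,
              start     = datetime(2010, 1, 1),  # datetime object...
              end       = '2015-12-31')          # ...or a '%Y-%m-%d' string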
......@@ -69,7 +77,6 @@ def compute_coocs(corpus,
# - TODO add grouped element's values in grouping 'chief ngram'
# - TODO cvalue_id: allow a metric as additional input filter
# - TODO n_min, n_max : filter on Ngram.n (aka length of ngram)
# - TODO start, end : filter on document date
# - TODO weighted: if False normal cooc to be saved as result
# if True weighted cooc (experimental)
......@@ -128,6 +135,42 @@ def compute_coocs(corpus,
.filter( ~ x2.ngram_id.in_(stop_subquery) )
)
if start:
if isinstance(start, datetime):
start_str = start.strftime("%Y-%m-%d %H:%M:%S")
else:
start_str = str(start)
# doc_ids matching this limit
starttime_subquery = (session
.query(NodeHyperdata.node_id)
.filter(NodeHyperdata.key=="publication_date")
.filter(NodeHyperdata.value_str >= start_str)
.subquery()
)
# direct use of str comparison op because there is consistency b/w
# sql alpha sort and chrono sort *in this format %Y-%m-%d %H:%M:%S*
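# e.g. '2015-09-30 23:59:59' < '2015-10-01 00:00:00' both alphabetically and chronologically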
# the filtering by start limit
coocs_query = coocs_query.filter(x1.node_id.in_(starttime_subquery))
if end:
if isinstance(end, datetime):
end_str = end.strftime("%Y-%m-%d %H:%M:%S")
else:
end_str = str(end)
endtime_subquery = (session
.query(NodeHyperdata.node_id)
.filter(NodeHyperdata.key=="publication_date")
.filter(NodeHyperdata.value_str <= end_str)
.subquery()
)
# the filtering by end limit
coocs_query = coocs_query.filter(x1.node_id.in_(endtime_subquery))
if symmetry_filter:
# a filter taking symmetry into account
# -> cuts the work in half !!
......@@ -167,7 +210,7 @@ def compute_coocs(corpus,
# 5) SAVE
# --------
# saving the parameters of the analysis in the Node JSON
new_hyperdata = { 'corpus': corpus.id,
new_hyperdata = { 'corpus' : corpus.id,
'threshold': threshold }
if overwrite_id:
# overwrite pre-existing id
......
......@@ -6,6 +6,7 @@ from gargantext.util.ngramsextractors import ngramsextractors
from collections import defaultdict
from re import sub
from gargantext.util.scheduling import scheduled
def _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor):
print('INTEGRATE')
......
from gargantext.util.http import *
from gargantext.util.db import *
from gargantext.util.db_cache import *
from gargantext.models import *
from gargantext.constants import *
from gargantext.models import Node, Ngram, NodeNgram, NodeNodeNgram
from gargantext.constants import NODETYPES
from gargantext.util.db import session, delete, func
from gargantext.util.db_cache import cache, or_
from gargantext.util.validation import validate
from gargantext.util.http import ValidationException, APIView \
, get_parameters, JsonHttpResponse, Http404
from collections import defaultdict
......@@ -71,22 +73,108 @@ class NodeListResource(APIView):
]
})
def post(self, request):
"""Create a new node.
NOT IMPLEMENTED
"""
def delete(self, request):
"""Removes the list of nodes corresponding to the query.
WARNING! THIS IS TOTALLY UNTESTED!!!!!
TODO : Should be a delete method!
"""
parameters, query, count = _query_nodes(request)
query.delete()
parameters = get_parameters(request)
parameters = validate(parameters, {'ids': list} )
try :
node_ids = [int(n) for n in parameters['ids'].split(',')]
except :
raise ValidationException('"ids" needs integers separated by comma.')
result = session.execute(
delete(Node).where(Node.id.in_(node_ids))
)
session.commit()
return JsonHttpResponse({
'parameters': parameters,
'count': count,
}, 200)
return JsonHttpResponse({'deleted': result.rowcount})
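For illustration only, a hedged client-side sketch of this endpoint using the `requests` library (not a dependency of this codebase; CSRF and authentication handling omitted; the JavaScript caller later in this diff sends the same DELETE with an X-CSRFToken header):

import requests
# hypothetical host and node ids
resp = requests.delete('http://localhost:8000/api/nodes', params={'ids': '123,456'})
print(resp.json())   # e.g. {'deleted': 2}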
class NodeListHaving(APIView):
'''
Gives a list of nodes ranked by a score related to some specific ngrams.
TODO: implement other options (offset)
Simple implementation:
takes the IDs of a corpus and of ngrams, and returns the list of relevant
documents in JSON format, ordered by decreasing TFIDF score.
'''
def get(self, request, corpus_id):
parameters = get_parameters(request)
parameters = validate(parameters, {'score': str, 'ngram_ids' : list} )
try :
ngram_ids = [int(n) for n in parameters['ngram_ids'].split(',')]
except :
raise ValidationException('"ngram_ids" needs integers separated by comma.')
limit=5
nodes_list = []
corpus = session.query(Node).filter(Node.id==corpus_id).first()
tfidf_id = ( session.query( Node.id )
.filter( Node.typename == "TFIDF-CORPUS"
, Node.parent_id == corpus.id
)
.first()
)
tfidf_id = tfidf_id[0]
print(tfidf_id)
# request data
nodes_query = (session
.query(Node, func.sum(NodeNodeNgram.score))
.join(NodeNodeNgram, NodeNodeNgram.node2_id == Node.id)
.filter(NodeNodeNgram.node1_id == tfidf_id)
.filter(Node.typename == 'DOCUMENT', Node.parent_id== corpus.id)
.filter(or_(*[NodeNodeNgram.ngram_id==ngram_id for ngram_id in ngram_ids]))
.group_by(Node)
.order_by(func.sum(NodeNodeNgram.score).desc())
.limit(limit)
)
# print("\n")
# print("in TFIDF:")
# print("\tcorpus_id:",corpus_id)
# convert query result to a list of dicts
# if nodes_query is None:
# print("TFIDF error, juste take sums")
# nodes_query = (session
# .query(Node, func.sum(NodeNgram.weight))
# .join(NodeNgram, NodeNgram.node_id == Node.id)
# .filter(Node.parent_id == corpus_id)
# .filter(Node.typename == 'DOCUMENT')
# .filter(or_(*[NodeNgram.ngram_id==ngram_id for ngram_id in ngram_ids]))
# .group_by(Node)
# .order_by(func.sum(NodeNgram.weight).desc())
# .limit(limit)
# )
for node, score in nodes_query:
print(node,score)
print("\t corpus:",corpus_id,"\t",node.name)
node_dict = {
'id': node.id,
'score': score,
}
for key in ('title', 'publication_date', 'journal', 'authors', 'fields'):
if key in node.hyperdata:
node_dict[key] = node.hyperdata[key]
nodes_list.append(node_dict)
return JsonHttpResponse(nodes_list)
class NodeResource(APIView):
......@@ -104,7 +192,6 @@ class NodeResource(APIView):
parameters, query, count = _query_nodes(request, node_id)
if not len(query):
raise Http404()
from sqlalchemy import delete
result = session.execute(
delete(Node).where(Node.id == node_id)
)
......@@ -177,3 +264,6 @@ class CorpusFacet(APIView):
# // if subfield not in corpus.aggs:
# // corpus.aggs[subfield] = xcounts
return (xcounts, total)
......@@ -5,10 +5,11 @@ from . import ngramlists
urlpatterns = [
url(r'^nodes$', nodes.NodeListResource.as_view()),
url(r'^nodes/(\d+)$', nodes.NodeResource.as_view()),
url(r'^nodes$' , nodes.NodeListResource.as_view()),
url(r'^nodes/(\d+)$' , nodes.NodeResource.as_view()),
url(r'^nodes/(\d+)/facets$', nodes.CorpusFacet.as_view()),
url(r'^nodes/(\d+)/facets$' , nodes.CorpusFacet.as_view()),
url(r'^nodes/(\d+)/having$' , nodes.NodeListHaving.as_view()),
# add or remove ngram from a list
# ex: add <=> PUT ngramlists/change?list=42&ngrams=1,2
......@@ -23,5 +24,4 @@ urlpatterns = [
# - an optional grouplist)
url(r'^ngramlists/family$', ngramlists.ListFamily.as_view()),
]
......@@ -7,21 +7,9 @@ def login(request):
"""Performs user login
"""
auth.logout(request)
# if the user wants to access the login form
if request.method == 'GET':
additional_context = {}
# if for example: auth/?next=/project/5/corpus/554/document/556/
# => we'll forward ?next="..." into template with form
if 'next' in request.GET:
additional_context = {'next_page':request.GET['next']}
return render(
template_name = 'pages/auth/login.html',
request = request,
context = additional_context,
)
# if the user send her authentication data to the page
elif request.method == "POST":
if request.method == "POST":
# /!\ pass is sent clear in POST data: use SSL
user = auth.authenticate(
username = request.POST['username'],
......@@ -35,6 +23,19 @@ def login(request):
else:
return redirect('/projects/')
# if the user wants to access the login form
additional_context = {}
# if for example: auth/?next=/project/5/corpus/554/document/556/
# => we'll forward ?next="..." into template with form
if 'next' in request.GET:
additional_context = {'next_page':request.GET['next']}
return render(
template_name = 'pages/auth/login.html',
request = request,
context = additional_context,
)
def logout(request):
"""Logout the user, and redirect to main page
......
......@@ -24,23 +24,21 @@ def _get_user_project_corpus(request, project_id, corpus_id):
@requires_auth
def corpus(request, project_id, corpus_id):
def docs_by_titles(request, project_id, corpus_id):
authorized, user, project, corpus = _get_user_project_corpus(request, project_id, corpus_id)
if not authorized:
return HttpResponseForbidden()
# response!
return render(
template_name = 'pages/corpora/corpus.html',
template_name = 'pages/corpora/titles.html',
request = request,
context = {
'debug': DEBUG,
'user': user,
'date': datetime.now(),
'project': project,
'corpus': corpus,
# 'processing': corpus['extracted'],
# 'number': number,
'view': 'documents'
'view': 'titles',
'user': request.user
},
)
......@@ -74,3 +72,4 @@ def docs_by_journals(request, project_id, corpus_id):
'view': 'journals'
},
)
......@@ -6,7 +6,7 @@ from gargantext.models import *
from gargantext.constants import *
from gargantext.util.scheduling import scheduled
from gargantext.util.toolchain import parse_extract
from gargantext.util.toolchain import parse_extract_indexhyperdata
from datetime import datetime
from collections import defaultdict
......@@ -94,8 +94,9 @@ def project(request, project_id):
)
session.add(corpus)
session.commit()
# parse_extract: fileparsing -> ngram extraction -> lists
scheduled(parse_extract)(corpus.id)
scheduled(parse_extract_indexhyperdata)(corpus.id)
# corpora within this project
corpora = project.children('CORPUS').all()
......
......@@ -20,7 +20,7 @@ urlpatterns = [
url(r'^projects/(\d+)/?$', projects.project),
# corpora
url(r'^projects/(\d+)/corpora/(\d+)/?$', corpora.corpus),
url(r'^projects/(\d+)/corpora/(\d+)/?$', corpora.docs_by_titles),
url(r'^projects/(\d+)/corpora/(\d+)/chart/?$', corpora.chart),
# corpus by journals
......@@ -28,4 +28,5 @@ urlpatterns = [
# terms table for the corpus
url(r'^projects/(\d+)/corpora/(\d+)/terms/?$', terms.ngramtable),
]
Module Graph Explorer: from text to graph.
Maintainer: If you see bugs, please report to team@gargantext.org
# Article coming soon
from gargantext.util.db import session
from gargantext.models.ngrams import Ngram
from collections import defaultdict
from networkx.readwrite import json_graph
def filterByBridgeness(G,partition,ids,weight,bridgeness,type,field1,field2):
# Data are stored in a dict(), (== hashmap by default for Python)
data = dict()
if type == "node_link":
nodesB_dict = {}
for node_id in G.nodes():
#node,type(labels[node])
G.node[node_id]['pk'] = ids[node_id][1]
nodesB_dict [ ids[node_id][1] ] = True
# TODO the query below is not optimized (do it do_distance).
the_label = session.query(Ngram.terms).filter(Ngram.id==node_id).first()
the_label = ", ".join(the_label)
G.node[node_id]['label'] = the_label
G.node[node_id]['size'] = weight[node_id]
G.node[node_id]['type'] = ids[node_id][0].replace("ngrams","terms")
G.node[node_id]['attributes'] = { "clust_default": partition[node_id]} # new format
# G.add_edge(node, "cluster " + str(partition[node]), weight=3)
links = []
i=1
if bridgeness > 0:
com_link = defaultdict(lambda: defaultdict(list))
com_ids = defaultdict(list)
for k, v in partition.items():
com_ids[v].append(k)
for e in G.edges_iter():
s = e[0]
t = e[1]
weight = G[ids[s][1]][ids[t][1]]["weight"]
if bridgeness < 0:
info = { "s": ids[s][1]
, "t": ids[t][1]
, "w": weight
}
links.append(info)
else:
if partition[s] == partition[t]:
info = { "s": ids[s][1]
, "t": ids[t][1]
, "w": weight
}
links.append(info)
if bridgeness > 0:
if partition[s] < partition[t]:
com_link[partition[s]][partition[t]].append((s,t,weight))
if bridgeness > 0:
for c1 in com_link.keys():
for c2 in com_link[c1].keys():
index = round(bridgeness*len(com_link[c1][c2]) / (len(com_ids[c1]) + len(com_ids[c2])))
#print((c1,len(com_ids[c1])), (c2,len(com_ids[c2])), index)
if index > 0:
for link in sorted(com_link[c1][c2], key=lambda x: x[2], reverse=True)[:index]:
#print(c1, c2, link[2])
info = {"s": link[0], "t": link[1], "w": link[2]}
links.append(info)
B = json_graph.node_link_data(G)
B["links"] = []
B["links"] = links
if field1 == field2 == 'ngrams' :
data["nodes"] = B["nodes"]
data["links"] = B["links"]
else:
A = get_graphA( "journal" , nodesB_dict , B["links"] , corpus )
print("#nodesA:",len(A["nodes"]))
print("#linksAA + #linksAB:",len(A["links"]))
print("#nodesB:",len(B["nodes"]))
print("#linksBB:",len(B["links"]))
data["nodes"] = A["nodes"] + B["nodes"]
data["links"] = A["links"] + B["links"]
print(" total nodes :",len(data["nodes"]))
print(" total links :",len(data["links"]))
print("")
elif type == "adjacency":
for node in G.nodes():
try:
#node,type(labels[node])
#G.node[node]['label'] = node
G.node[node]['name'] = node
#G.node[node]['size'] = weight[node]
G.node[node]['group'] = partition[node]
#G.add_edge(node, partition[node], weight=3)
except Exception as error:
print("error02: ",error)
data = json_graph.node_link_data(G)
elif type == 'bestpartition':
return(partition)
return(data)
from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram, \
NodeHyperdata
from gargantext.util.db import session, aliased, bulk_insert, func
from gargantext.util.lists import WeightedMatrix, UnweightedList, Translations
from sqlalchemy import desc, asc, or_, and_
#import inspect
import datetime
def countCooccurrences( corpus=None
, field1='ngrams' , field2='ngrams'
, start=None , end=None
, mapList_id=None , groupList_id=None
, n_min=1, n_max=None , limit=1000
, coocNode_id=None , reset=True
, isMonopartite=True , threshold = 3):
'''
Compute the cooccurence matrix and save it, returning NodeNgramNgram.node_id
For the moment, lists of parameters are not supported because lists need to
be merged beforehand.
corpus :: Corpus
mapList_id :: Int
groupList_id :: Int
For the moment, start and end are simple: only the '%Y-%m-%d' format is parsed
start :: date string -- example: '2010-05-30'
end :: date string
limit :: Int
'''
# TODO : add hyperdata here
# Security test
field1,field2 = str(field1), str(field2)
# Get node
if not coocNode_id:
coocNode_id = ( session.query( Node.id )
.filter( Node.typename == "COOCCURRENCES"
, Node.name == "GRAPH EXPLORER"
, Node.parent_id == corpus.id
)
.first()
)
if not coocNode_id:
coocNode = corpus.add_child(
typename = "COOCCURRENCES",
name = "GRAPH EXPLORER COOC (in:%s)" % corpus.id
)
session.add(coocNode)
session.commit()
coocNode_id = coocNode.id
else :
coocNode_id = coocNode_id[0]
if reset == True :
session.query( NodeNgramNgram ).filter( NodeNgramNgram.node_id == coocNode_id ).delete()
session.commit()
NodeNgramX = aliased(NodeNgram)
# Simple Cooccurrences
cooc_score = func.count(NodeNgramX.node_id).label('cooc_score')
# A kind of Euclidean distance cooccurrences
#cooc_score = func.sqrt(func.sum(NodeNgramX.weight * NodeNgramY.weight)).label('cooc_score')
if isMonopartite :
NodeNgramY = aliased(NodeNgram)
cooc_query = (session.query( NodeNgramX.ngram_id
, NodeNgramY.ngram_id
, cooc_score
)
.join( Node
, Node.id == NodeNgramX.node_id
)
.join( NodeNgramY
, NodeNgramY.node_id == Node.id
)
.filter( Node.parent_id==corpus.id
, Node.typename=="DOCUMENT"
)
)
else :
NodeNgramY = aliased(NodeNgram)
cooc_query = (session.query( NodeHyperdataNgram.ngram_id
, NodeNgramY.ngram_id
, cooc_score
)
.join( Node
, Node.id == NodeHyperdataNgram.node_id
)
.join( NodeNgramY
, NodeNgramY.node_id == Node.id
)
.join( Hyperdata
, Hyperdata.id == NodeHyperdataNgram.hyperdata_id
)
.filter( Node.parent_id == corpus.id
, Node.typename == "DOCUMENT"
)
.filter( Hyperdata.name == field1 )
)
# Size of the ngrams between n_min and n_max
if n_min is not None or n_max is not None:
if isMonopartite:
NgramX = aliased(Ngram)
cooc_query = cooc_query.join ( NgramX
, NgramX.id == NodeNgramX.ngram_id
)
NgramY = aliased(Ngram)
cooc_query = cooc_query.join ( NgramY
, NgramY.id == NodeNgramY.ngram_id
)
if n_min is not None:
cooc_query = (cooc_query
.filter(NgramY.n >= n_min)
)
if isMonopartite:
cooc_query = cooc_query.filter(NgramX.n >= n_min)
if n_max is not None:
cooc_query = (cooc_query
.filter(NgramY.n <= n_max)
)
if isMonopartite:
cooc_query = cooc_query.filter(NgramX.n <= n_max)
# Cooc between the dates start and end
if start is not None:
#date_start = datetime.datetime.strptime ("2001-2-3 10:11:12", "%Y-%m-%d %H:%M:%S")
# TODO : support more complex date formats here.
date_start = datetime.datetime.strptime (str(start), "%Y-%m-%d")
date_start_utc = date_start.strftime("%Y-%m-%d %H:%M:%S")
Start=aliased(NodeHyperdata)
StartFormat = aliased(Hyperdata)
cooc_query = (cooc_query.join( Start
, Start.node_id == Node.id
)
.join( StartFormat
, StartFormat.id == Start.hyperdata_id
)
.filter( StartFormat.name == 'publication_date')
.filter( Start.value_datetime >= date_start_utc)
)
if end is not None:
# TODO : support more complex date formats here.
date_end = datetime.datetime.strptime (str(end), "%Y-%m-%d")
date_end_utc = date_end.strftime("%Y-%m-%d %H:%M:%S")
End=aliased(NodeHyperdata)
EndFormat = aliased(Hyperdata)
cooc_query = (cooc_query.join( End
, End.node_id == Node.id
)
.join( EndFormat
, EndFormat.id == End.hyperdata_id
)
.filter( EndFormat.name == 'publication_date' )
.filter( End.value_datetime <= date_end_utc )
)
if isMonopartite:
# Cooc is symmetric, take only the main cooccurrences and cut at the limit
cooc_query = cooc_query.filter(NodeNgramX.ngram_id < NodeNgramY.ngram_id)
cooc_query = cooc_query.having(cooc_score > threshold)
if isMonopartite:
cooc_query = cooc_query.group_by(NodeNgramX.ngram_id, NodeNgramY.ngram_id)
else:
cooc_query = cooc_query.group_by(NodeHyperdataNgram.ngram_id, NodeNgramY.ngram_id)
# Order according some scores
cooc_query = cooc_query.order_by(desc('cooc_score'))
matrix = WeightedMatrix(cooc_query)
mapList = UnweightedList( mapList_id )
group_list = Translations ( groupList_id )
cooc = matrix & (mapList * group_list)
cooc.save(coocNode_id)
return(coocNode_id)
from gargantext.models import Node, NodeNgram, NodeNgramNgram, \
NodeHyperdata
from gargantext.util.db import session, aliased
from graphExplorer.louvain import best_partition
from copy import copy
from collections import defaultdict
from math import log,sqrt
#from operator import itemgetter
import math
import numpy as np
import pandas as pd
import networkx as nx
def clusterByDistances( cooc_id
, field1=None, field2=None
, distance='conditional'):
'''
clusterByDistances :: Int -> (Graph, Partition, {ids}, {weight})
'''
# implicit global session
authorized = ['conditional', 'distributional', 'cosine']
if distance not in authorized:
distance = 'conditional'
matrix = defaultdict(lambda : defaultdict(float))
ids = defaultdict(lambda : defaultdict(int))
labels = dict()
weight = dict()
Cooc = aliased(NodeNgramNgram)
query = session.query(Cooc).filter(Cooc.node_id==cooc_id).all()
for cooc in query:
matrix[cooc.ngram1_id][cooc.ngram2_id] = cooc.weight
matrix[cooc.ngram2_id][cooc.ngram1_id] = cooc.weight
ids[cooc.ngram1_id] = (field1, cooc.ngram1_id)
ids[cooc.ngram2_id] = (field2, cooc.ngram2_id)
weight[cooc.ngram1_id] = weight.get(cooc.ngram1_id, 0) + cooc.weight
weight[cooc.ngram2_id] = weight.get(cooc.ngram2_id, 0) + cooc.weight
x = pd.DataFrame(matrix).fillna(0)
if distance == 'conditional':
x = x / x.sum(axis=1)
#y = y / y.sum(axis=0)
xs = x.sum(axis=1) - x
ys = x.sum(axis=0) - x
# top included or excluded
n = ( xs + ys) / (2 * (x.shape[0] - 1))
# top generic or specific
m = ( xs - ys) / (2 * (x.shape[0] - 1))
n = n.sort(inplace=False)
m = m.sort(inplace=False)
nodes_included = 500 #int(round(size/20,0))
#nodes_excluded = int(round(size/10,0))
nodes_specific = 500 #int(round(size/10,0))
#nodes_generic = int(round(size/10,0))
# TODO use the included score for the node size
n_index = pd.Index.intersection(x.index, n.index[:nodes_included])
# Generic:
#m_index = pd.Index.intersection(x.index, m.index[:nodes_generic])
# Specific:
m_index = pd.Index.intersection(x.index, m.index[-nodes_specific:])
#m_index = pd.Index.intersection(x.index, n.index[:nodes_included])
x_index = pd.Index.union(n_index, m_index)
xx = x[list(x_index)].T[list(x_index)]
# Removing unconnected nodes
xxx = xx.values
threshold = min(xxx.max(axis=1))
matrix_filtered = np.where(xxx >= threshold, xxx, 0)
#matrix_filtered = matrix_filtered.resize((90,90))
G = nx.from_numpy_matrix(np.matrix(matrix_filtered))
G = nx.relabel_nodes(G, dict(enumerate([ ids[id_][1] for id_ in list(xx.columns)])))
elif distance == 'cosine':
scd = defaultdict(lambda : defaultdict(int))
for i in matrix.keys():
for j in matrix.keys():
numerator = sum(
[
matrix[i][k] * matrix[j][k]
for k in matrix.keys()
if i != j and k != i and k != j
]
)
denominator = sqrt(
sum([
matrix[i][k]
for k in matrix.keys()
if k != i and k != j #and matrix[i][k] > 0
])
*
sum([
matrix[i][k]
for k in matrix.keys()
if k != i and k != j #and matrix[i][k] > 0
])
)
try:
scd[i][j] = numerator / denominator
except Exception as error:
scd[i][j] = 0
minmax = min([ max([ scd[i][j] for i in scd.keys()]) for j in scd.keys()])
G = nx.DiGraph()
G.add_edges_from(
[
(i, j, {'weight': scd[i][j]})
for i in scd.keys() for j in scd.keys()
if i != j and scd[i][j] > minmax and scd[i][j] > scd[j][i]
]
)
elif distance == 'distributional':
mi = defaultdict(lambda : defaultdict(int))
total_cooc = x.sum().sum()
for i in matrix.keys():
si = sum([matrix[i][j] for j in matrix[i].keys() if i != j])
for j in matrix[i].keys():
sj = sum([matrix[j][k] for k in matrix[j].keys() if j != k])
if i!=j :
mi[i][j] = log( matrix[i][j] / ((si * sj) / total_cooc) )
r = defaultdict(lambda : defaultdict(int))
for i in matrix.keys():
for j in matrix.keys():
sumMin = sum(
[
min(mi[i][k], mi[j][k])
for k in matrix.keys()
if i != j and k != i and k != j and mi[i][k] > 0
]
)
sumMi = sum(
[
mi[i][k]
for k in matrix.keys()
if k != i and k != j and mi[i][k] > 0
]
)
try:
r[i][j] = sumMin / sumMi
except Exception as error:
r[i][j] = 0
# Need to filter the weak links, automatic threshold here
minmax = min([ max([ r[i][j] for i in r.keys()]) for j in r.keys()])
G = nx.DiGraph()
G.add_edges_from(
[
(i, j, {'weight': r[i][j]})
for i in r.keys() for j in r.keys()
if i != j and r[i][j] > minmax and r[i][j] > r[j][i]
]
)
# degree_max = max([(n, d) for n,d in G.degree().items()], key=itemgetter(1))[1]
# nodes_to_remove = [n for (n,d) in G.degree().items() if d <= round(degree_max/2)]
# G.remove_nodes_from(nodes_to_remove)
# Removing too connected nodes (find automatic way to do it)
#edges_to_remove = [ e for e in G.edges_iter() if
# nodes_to_remove = [n for n in degree if degree[n] <= 1]
# G.remove_nodes_from(nodes_to_remove)
def getWeight(item):
return item[1]
#
# node_degree = sorted(G.degree().items(), key=getWeight, reverse=True)
# #print(node_degree)
# nodes_too_connected = [n[0] for n in node_degree[0:(round(len(node_degree)/5))]]
#
# for n in nodes_too_connected:
# n_edges = list()
# for v in nx.neighbors(G,n):
# #print((n, v), G[n][v]['weight'], ":", (v,n), G[v][n]['weight'])
# n_edges.append(((n, v), G[n][v]['weight']))
#
# n_edges_sorted = sorted(n_edges, key=getWeight, reverse=True)
# #G.remove_edges_from([ e[0] for e in n_edges_sorted[round(len(n_edges_sorted)/2):]])
# #G.remove_edges_from([ e[0] for e in n_edges_sorted[(round(len(nx.neighbors(G,n))/3)):]])
# G.remove_edges_from([ e[0] for e in n_edges_sorted[10:]])
G.remove_nodes_from(nx.isolates(G))
partition = best_partition(G.to_undirected())
return(G,partition,ids,weight)
# Gargantext lib
from gargantext.util.db import session
from gargantext.util.http import JsonHttpResponse
from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram
#from gargantext.util.toolchain.ngram_coocs import compute_coocs
from graphExplorer.cooccurrences import countCooccurrences
from graphExplorer.distances import clusterByDistances
from graphExplorer.bridgeness import filterByBridgeness
# Prelude lib
from copy import copy, deepcopy
from collections import defaultdict
from sqlalchemy.orm import aliased
# Math/Graph lib
import math
import pandas as pd
import numpy as np
import networkx as nx
def get_graph( request=None , corpus=None
, field1='ngrams' , field2='ngrams'
, mapList_id = None , groupList_id = None
, cooc_id=None , type='node_link'
, start=None , end=None
, threshold=1
, distance='conditional'
, isMonopartite=True # By default, we compute terms/terms graph
, bridgeness=5
#, size=1000
):
'''
Get_graph : main steps:
1) count Cooccurrences (function countCooccurrences)
main parameters: threshold
2) filter and cluster By Distances (function clusterByDistances)
main parameter: distance
3) filter By Bridgeness (filter By Bridgeness)
main parameter: bridgeness
4) format the graph (formatGraph)
main parameter: format_
'''
if cooc_id == None:
cooc_id = countCooccurrences( corpus=corpus
#, field1="ngrams", field2="ngrams"
, start=start , end =end
, mapList_id=mapList_id , groupList_id=groupList_id
, isMonopartite=True , threshold = threshold
#, limit=size
)
G, partition, ids, weight = clusterByDistances ( cooc_id
, field1="ngrams", field2="ngrams"
, distance=distance
)
data = filterByBridgeness(G,partition,ids,weight,bridgeness,type,field1,field2)
return data
#from rest_framework.authentication import SessionAuthentication, BasicAuthentication
from gargantext.util.db import session
from gargantext.models.nodes import Node
from graphExplorer.graph import get_graph
from gargantext.util.http import APIView, APIException\
, JsonHttpResponse, requires_auth
# TODO check authentication
class Graph(APIView):
'''
REST part for graphs.
'''
def get(self, request, project_id, corpus_id):
'''
Graph.get :: Get graph data as REST api.
Get all the parameters first
graph?field1=ngrams&field2=ngrams&
graph?field1=ngrams&field2=ngrams&start=''&end=''
'''
# Get the node we are working with
corpus = session.query(Node).filter(Node.id==corpus_id).first()
# Get all the parameters in the URL
field1 = str(request.GET.get ('field1' , 'ngrams' ))
field2 = str(request.GET.get ('field2' , 'ngrams' ))
start = request.GET.get ('start' , None )
end = request.GET.get ('end' , None )
mapList_id = int(request.GET.get ('mapList' , 0 ))
groupList_id = int(request.GET.get ('groupList' , 0 ))
threshold = int(request.GET.get ('threshold' , 1 ))
bridgeness = int(request.GET.get ('bridgeness', -1 ))
format_ = str(request.GET.get ('format' , 'json' ))
type_ = str(request.GET.get ('type' , 'node_link' ))
distance = str(request.GET.get ('distance' , 'conditional'))
# Get default value if no map list
if mapList_id == 0 :
mapList_id = ( session.query ( Node.id )
.filter( Node.typename == "MAPLIST"
, Node.parent_id == corpus.id
)
.first()
)
mapList_id = mapList_id[0]
if mapList_id == None :
raise ValueError("MAPLIST node needed for cooccurrences")
# Get default value if no group list
if groupList_id == 0 :
groupList_id = ( session.query ( Node.id )
.filter( Node.typename == "GROUPLIST"
, Node.parent_id == corpus.id
)
.first()
)
groupList_id = groupList_id[0]
if groupList_id == None :
raise ValueError("GROUPLIST node needed for cooccurrences")
# Check the options
accepted_field1 = ['ngrams', 'journal', 'source', 'authors']
accepted_field2 = ['ngrams', ]
options = ['start', 'end', 'threshold', 'distance' ]
if field1 in accepted_field1 :
if field2 in accepted_field2 :
if start is not None and end is not None :
data = get_graph( corpus=corpus
#, field1=field1 , field2=field2
, mapList_id = mapList_id , groupList_id = groupList_id
, start=start , end=end
, threshold =threshold , distance=distance
)
else:
data = get_graph( corpus = corpus
#, field1=field1, field2=field2
, mapList_id = mapList_id , groupList_id = groupList_id
, threshold = threshold
, distance = distance
, bridgeness = bridgeness
)
if format_ == 'json':
return JsonHttpResponse(data)
else:
return JsonHttpResponse({
'Warning USAGE' : 'One field for each range:'
, 'field1' : accepted_field1
, 'field2' : accepted_field2
, 'options': options
})
mv /srv/gargantext/static/js/tina* .
from django.conf.urls import patterns, url
from graphExplorer import views
# /!\ urls patterns here are *without* the trailing slash
urlpatterns = patterns('',
url(r'^register/$', views.Register.as_view()), # Register
url(r'^login/$', views.Login.as_view()), # Login
)
from gargantext.util.http import *
from gargantext.util.db import *
from gargantext.util.db_cache import cache
from gargantext.models import *
from gargantext.constants import *
from gargantext.settings import *
from datetime import datetime
@requires_auth
def explorer(request, project_id, corpus_id):
'''
Graph explorer, also known as TinaWebJS, using SigmaJS.
Nodes are ngrams (from the title, abstract or journal name).
Links represent a proximity measure.
'''
# we pass our corpus
corpus = cache.Node[corpus_id]
# and the project just for project.id in corpusBannerTop
project = cache.Node[project_id]
graphurl = "projects/" + str(project_id) + "/corpora/" + str(corpus_id) + "/node_link.json"
# rendered page : explorer.html
return render(
template_name = 'graphExplorer/explorer.html',
request = request,
context = {
'debug' : settings.DEBUG,
'request' : request,
'user' : request.user,
'date' : datetime.now(),
'project' : project,
'corpus' : corpus,
#'list_id' : maplist.id,\
'graphfile' : graphurl,\
'view' : 'graph'
},
)
#!/bin/dash
# TODO do apt-get install --force-yes
#postgresql3.4-server-dev
#+libxml2-dev
sudo apt-get install --force-yes postgresql
sudo apt-get install --force-yes postgresql-contrib
sudo apt-get install --force-yes rabbitmq-server
sudo apt-get install --force-yes tmux
sudo apt-get install --force-yes uwsgi uwsgi-plugin-python3
#apt-get install --force-yes python-virtualenv
sudo apt-get install --force-yes libpng12-dev
sudo apt-get install --force-yes libpng-dev
sudo apt-get install --force-yes libfreetype6-dev
sudo apt-get install --force-yes python-dev
sudo apt-get install --force-yes libpq-dev
sudo apt-get install --force-yes libpq-dev
#apt-get build-dep python-matplotlib
#apt-get install --force-yes python-matplotlib
# Debian packages to install
# easy_install --force-yes -U distribute (matplotlib)
#lxml
sudo apt-get install --force-yes libffi-dev
sudo apt-get install --force-yes libxml2-dev
sudo apt-get install --force-yes libxslt1-dev
# ipython readline
sudo apt-get install --force-yes libncurses5-dev
sudo apt-get install --force-yes pandoc
# scipy:
sudo apt-get install --force-yes gfortran
sudo apt-get install --force-yes libopenblas-dev
sudo apt-get install --force-yes liblapack-dev
#nlpserver
sudo apt-get install --force-yes libgflags-dev
sudo apt-get install --force-yes libgoogle-glog-dev
# MElt
# soon
## SERVER Configuration
# server configuration
sudo apt-get install --force-yes nginx
# UWSGI with pcre support
sudo apt-get install --force-yes libpcre3 libpcre3-dev
sudo apt-get install --force-yes python3-pip
#pip3 install --force-yes uwsgi
#!/bin/bash
#MAINTAINER ISCPIF <alexandre.delanoe@iscpif.fr>
git clone ssh://gitolite@delanoe.org:1979/gargantext /srv/gargantext \
&& cd /srv/gargantext \
&& git fetch origin refactoring-alex \
&& git checkout refactoring-alex
cd /srv/gargantext/install \
&& /usr/bin/virtualenv --py=/usr/bin/python3.5 /srv/env_3-5 \
&& /bin/bash -c 'source /srv/env_3-5/bin/activate' \
&& /bin/bash -c '/srv/env_3-5/bin/pip install git+https://github.com/zzzeek/sqlalchemy.git@rel_1_1' \
&& /bin/bash -c '/srv/env_3-5/bin/pip install -r /srv/gargantext/install/python/requirements.txt' \
## INSTALL MAIN DEPENDENCIES
cd /tmp && wget http://dl.gargantext.org/gargantext_lib.tar.bz2 \
&& tar xvjf gargantext_lib.tar.bz2 -o /srv/gargantext_lib \
&& chown -R gargantua:gargantua /srv/gargantext_lib
## End of configuration
## be sure that postgres is running
cd /srv/gargantext && /bin/bash -c 'source /srv/bin/env_3-5/bin/activate' \
&& /srv/gargantext/manage.py shell < /srv/gargantext/init.py
echo "Gargantua: END of the installation of Gargantext"
#!/bin/bash
# ## CONFIGURE POSTGRESQL
psql -c "CREATE user gargantua WITH PASSWORD 'C8kdcUrAQy66U'" && createdb -O gargantua gargandb
#!/bin/bash
#MAINTAINER ISCPIF <alexandre.delanoe@iscpif.fr>
apt-get update && \
apt-get install -y \
apt-utils ca-certificates locales \
sudo aptitude gcc g++ wget git postgresql-9.5 vim
### Configure timezone and locale
echo "Europe/Paris" > /etc/timezone && \
dpkg-reconfigure -f noninteractive tzdata && \
sed -i -e 's/# en_GB.UTF-8 UTF-8/en_GB.UTF-8 UTF-8/' /etc/locale.gen && \
sed -i -e 's/# fr_FR.UTF-8 UTF-8/fr_FR.UTF-8 UTF-8/' /etc/locale.gen && \
echo 'LANG="fr_FR.UTF-8"' > /etc/default/locale && \
dpkg-reconfigure --frontend=noninteractive locales && \
update-locale LANG=fr_FR.UTF-8
## PROD VERSION OF GARGANTEXt
apt-get install -y uwsgi nginx
### CREATE USER and adding it to sudo
## USER gargantua cannot not connect with password but SSH key
adduser --disabled-password --gecos "" gargantua \
&& adduser gargantua sudo \
&& echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
# addgroup gargantext here with specific users
## Install Database, main dependencies and Python
## (installing some Debian version before pip to get dependencies)
apt-get update && apt-get install -y \
postgresql-server-dev-9.5 libpq-dev libxml2 \
libxml2-dev xml-core libgfortran-5-dev \
virtualenv python3-virtualenv \
python3.4 python3.4-dev \
python3.5 python3.5-dev \
python3-six python3-numpy python3-setuptools \
python3-numexpr \
libxml2-dev libxslt-dev
# (python3-numpy / python3-setuptools: for numpy and pandas;
#  python3-numexpr: for numpy performance; libxml2-dev libxslt-dev: for lxml)
#if [[ -e "/srv/gargantext" ]]
#rm -rf /srv/gargantext /srv/env_3-5
for dir in "/srv/gargantext"\
"/srv/gargantext_lib"\
"/srv/env_3-5"\
"/var/www/gargantext"; do \
mkdir $dir
chown gargantua:gargantua $dir
done
echo "Root: END of the installation of Gargantext by Root."
Docker installation
For dev: cd dev and run install
For prod: install the dev version, then cd prod and run install
#FROM debian:stretch
FROM gargantext
#MAINTAINER ISCPIF <alexandre.delanoe@iscpif.fr>
#
## Install docker.io
## Install sudo
## wget http://dl.gargantext.
## cd /srv/gargantext/install
## sudo docker build -t gargantext .
# docker run -i -t gargantext /bin/bash
USER root
# RUN apt-get update && \
# apt-get install -y \
# apt-utils ca-certificates locales \
# sudo aptitude gcc wget git postgresql-9.5 vim
#
# ## Configure timezone and locale
# RUN echo "Europe/Paris" > /etc/timezone && \
# dpkg-reconfigure -f noninteractive tzdata && \
# sed -i -e 's/# en_GB.UTF-8 UTF-8/en_GB.UTF-8 UTF-8/' /etc/locale.gen && \
# sed -i -e 's/# fr_FR.UTF-8 UTF-8/fr_FR.UTF-8 UTF-8/' /etc/locale.gen && \
# echo 'LANG="fr_FR.UTF-8"' > /etc/default/locale && \
# dpkg-reconfigure --frontend=noninteractive locales && \
# update-locale LANG=fr_FR.UTF-8
#
#
# RUN apt-get update \
# && apt-get install -y postgresql-server-dev-9.5 \
# libpq-dev libxml2 libxml2-dev xml-core libgfortran-5-dev
#
# # PROD VERSION OF GARGANTEXt
# # RUN apt-get install uwsgi nginx
#
#
# ## CREATE USER and adding it to sudo
# ## TODO ask user for password
# RUN adduser --disabled-password --gecos "" gargantua
# RUN apt-get install -y sudo && adduser gargantua sudo \
# && echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
#
# # Python dependencies
# # (installing Debian version before pip to get dependencies)
# # TODO: update it with requirements.txt
# RUN apt-get update && apt-get install -y \
# virtualenv python3-virtualenv \
# python3.4 python3.4-dev \
# python3.5 python3.5-dev \
# python3-six python3-numpy
#
# # Installing pip version of python libs
WORKDIR /home/gargantua
# FIXME : pip install -r all requirements does not work, need to split the list
#RUN wget http://dl.delanoe.org/requirements.txt \
# && /usr/bin/virtualenv --py=/usr/bin/python3.5 env_3-5 \
# && /bin/bash -c 'source env_3-5/bin/activate' \
# && /bin/bash -c 'env_3-5/bin/pip install git+https://github.com/zzzeek/sqlalchemy.git@rel_1_1' \
# && /bin/bash -c 'env_3-5/bin/pip install -r requirements.txt'
# chown gargantua
#
## CONFIGURE POSTGRESQL
#
#VOLUME ["/home/gargantua","/data/gargantext"]
## INSTALL MAIN DEPENDENCIES
#WORKDIR /srv
#RUN mkdir -p gargantext
# configure postgres here
# OK
USER postgres
RUN /etc/init.d/postgresql start &&\
psql -c "CREATE user gargantua WITH PASSWORD 'C8kdcUrAQy66U'" &&\
createdb -O gargantua gargandb
# GET CONFIG FILES
# USER gargantua
#RUN wget http://dl.delanoe.org/gargantext_big.txt -o /srv/gargantext_big.txt
#RUN cd /srv/ && git clone https://gogs.iscpif.fr/gargantext.git
# script to populate the database
# mount /srv
# execute
##################### INSTALLATION END #####################
# Expose the default port
#EXPOSE 54332
# Default port to execute the entrypoint (MongoDB)
#CMD ["--port 27017"]
# Set default container command
#ENTRYPOINT usr/bin/mongod
# ENTRYPOINT /etc/init.d/postgresql start
#/bin/bash
# Install Docker
# Debian/Ubuntu: apt-get install docker
# run turboparser port, with python 3.4
#docker run -d -p 8000:8000 -v /srv:/srv -t gargantext python /srv/gargantext/gargantext.py
# launch
#cd /srv/gargantext
#source /srv/env_3-5/bin/activate &&
#docker run -d -p 8000:8000 -v /srv:/srv -t gargantext python /srv/gargantext/gargantext.py
docker build -t gargantext .
# try bottleneck
amqp==1.4.9
anyjson==0.3.3
billiard==3.3.0.22 # multiprocessing fork
......@@ -21,8 +22,10 @@ python-dateutil==2.4.2
pytz==2015.7 # timezones
PyYAML==3.11
RandomWords==0.1.12
six==1.10.0
SQLAlchemy==1.1.0b1.dev0
ujson==1.35
umalqurra==0.2 # arabic calendars (?? why use ??)
wheel==0.29.0
pandas==0.18.0
networkx==1.11
six==1.10.0
#!/bin/bash
echo "Need to finish the dependencies. So soon... :)"
#!/bin/bash
FILE="/var/log/gargantext/celery/$(date +%Y%m%d-%H:%M:%S).log"
source /srv/gargantext_env_3.5/bin/activate
./manage.py celery worker -f $FILE
#!/bin/bash
FILE="/var/log/gargantext/uwsgi/$(date +%Y%m%d-%H:%M:%S).log"
#touch /var/log/gargantext/uwsgi/$FILE && sudo
uwsgi gargantext.ini --logto $FILE
/*
AngularJS v1.2.28
(c) 2010-2014 Google, Inc. http://angularjs.org
License: MIT
*/
(function(p,f,n){'use strict';f.module("ngCookies",["ng"]).factory("$cookies",["$rootScope","$browser",function(e,b){var c={},g={},h,k=!1,l=f.copy,m=f.isUndefined;b.addPollFn(function(){var a=b.cookies();h!=a&&(h=a,l(a,g),l(a,c),k&&e.$apply())})();k=!0;e.$watch(function(){var a,d,e;for(a in g)m(c[a])&&b.cookies(a,n);for(a in c)d=c[a],f.isString(d)||(d=""+d,c[a]=d),d!==g[a]&&(b.cookies(a,d),e=!0);if(e)for(a in d=b.cookies(),c)c[a]!==d[a]&&(m(d[a])?delete c[a]:c[a]=d[a])});return c}]).factory("$cookieStore",
["$cookies",function(e){return{get:function(b){return(b=e[b])?f.fromJson(b):b},put:function(b,c){e[b]=f.toJson(c)},remove:function(b){delete e[b]}}}])})(window,window.angular);
//# sourceMappingURL=angular-cookies.min.js.map
/*
AngularJS v1.2.28
(c) 2010-2014 Google, Inc. http://angularjs.org
License: MIT
*/
(function(){'use strict';function d(a){return function(){var c=arguments[0],b,c="["+(a?a+":":"")+c+"] http://errors.angularjs.org/1.2.28/"+(a?a+"/":"")+c;for(b=1;b<arguments.length;b++)c=c+(1==b?"?":"&")+"p"+(b-1)+"="+encodeURIComponent("function"==typeof arguments[b]?arguments[b].toString().replace(/ \{[\s\S]*$/,""):"undefined"==typeof arguments[b]?"undefined":"string"!=typeof arguments[b]?JSON.stringify(arguments[b]):arguments[b]);return Error(c)}}(function(a){var c=d("$injector"),b=d("ng");a=a.angular||
(a.angular={});a.$$minErr=a.$$minErr||d;return a.module||(a.module=function(){var a={};return function(e,d,f){if("hasOwnProperty"===e)throw b("badname","module");d&&a.hasOwnProperty(e)&&(a[e]=null);return a[e]||(a[e]=function(){function a(c,d,e){return function(){b[e||"push"]([c,d,arguments]);return g}}if(!d)throw c("nomod",e);var b=[],h=[],k=a("$injector","invoke"),g={_invokeQueue:b,_runBlocks:h,requires:d,name:e,provider:a("$provide","provider"),factory:a("$provide","factory"),service:a("$provide",
"service"),value:a("$provide","value"),constant:a("$provide","constant","unshift"),animation:a("$animateProvider","register"),filter:a("$filterProvider","register"),controller:a("$controllerProvider","register"),directive:a("$compileProvider","directive"),config:k,run:function(a){h.push(a);return this}};f&&k(f);return g}())}}())})(window)})(window);
//# sourceMappingURL=angular-loader.min.js.map
/*
AngularJS v1.2.28
(c) 2010-2014 Google, Inc. http://angularjs.org
License: MIT
*/
(function(H,a,A){'use strict';function D(p,g){g=g||{};a.forEach(g,function(a,c){delete g[c]});for(var c in p)!p.hasOwnProperty(c)||"$"===c.charAt(0)&&"$"===c.charAt(1)||(g[c]=p[c]);return g}var v=a.$$minErr("$resource"),C=/^(\.[a-zA-Z_$][0-9a-zA-Z_$]*)+$/;a.module("ngResource",["ng"]).factory("$resource",["$http","$q",function(p,g){function c(a,c){this.template=a;this.defaults=c||{};this.urlParams={}}function t(n,w,l){function r(h,d){var e={};d=x({},w,d);s(d,function(b,d){u(b)&&(b=b());var k;if(b&&
b.charAt&&"@"==b.charAt(0)){k=h;var a=b.substr(1);if(null==a||""===a||"hasOwnProperty"===a||!C.test("."+a))throw v("badmember",a);for(var a=a.split("."),f=0,c=a.length;f<c&&k!==A;f++){var g=a[f];k=null!==k?k[g]:A}}else k=b;e[d]=k});return e}function e(a){return a.resource}function f(a){D(a||{},this)}var F=new c(n);l=x({},B,l);s(l,function(h,d){var c=/^(POST|PUT|PATCH)$/i.test(h.method);f[d]=function(b,d,k,w){var q={},n,l,y;switch(arguments.length){case 4:y=w,l=k;case 3:case 2:if(u(d)){if(u(b)){l=
b;y=d;break}l=d;y=k}else{q=b;n=d;l=k;break}case 1:u(b)?l=b:c?n=b:q=b;break;case 0:break;default:throw v("badargs",arguments.length);}var t=this instanceof f,m=t?n:h.isArray?[]:new f(n),z={},B=h.interceptor&&h.interceptor.response||e,C=h.interceptor&&h.interceptor.responseError||A;s(h,function(a,b){"params"!=b&&("isArray"!=b&&"interceptor"!=b)&&(z[b]=G(a))});c&&(z.data=n);F.setUrlParams(z,x({},r(n,h.params||{}),q),h.url);q=p(z).then(function(b){var d=b.data,k=m.$promise;if(d){if(a.isArray(d)!==!!h.isArray)throw v("badcfg",
h.isArray?"array":"object",a.isArray(d)?"array":"object");h.isArray?(m.length=0,s(d,function(b){"object"===typeof b?m.push(new f(b)):m.push(b)})):(D(d,m),m.$promise=k)}m.$resolved=!0;b.resource=m;return b},function(b){m.$resolved=!0;(y||E)(b);return g.reject(b)});q=q.then(function(b){var a=B(b);(l||E)(a,b.headers);return a},C);return t?q:(m.$promise=q,m.$resolved=!1,m)};f.prototype["$"+d]=function(b,a,k){u(b)&&(k=a,a=b,b={});b=f[d].call(this,b,this,a,k);return b.$promise||b}});f.bind=function(a){return t(n,
x({},w,a),l)};return f}var B={get:{method:"GET"},save:{method:"POST"},query:{method:"GET",isArray:!0},remove:{method:"DELETE"},"delete":{method:"DELETE"}},E=a.noop,s=a.forEach,x=a.extend,G=a.copy,u=a.isFunction;c.prototype={setUrlParams:function(c,g,l){var r=this,e=l||r.template,f,p,h=r.urlParams={};s(e.split(/\W/),function(a){if("hasOwnProperty"===a)throw v("badname");!/^\d+$/.test(a)&&(a&&RegExp("(^|[^\\\\]):"+a+"(\\W|$)").test(e))&&(h[a]=!0)});e=e.replace(/\\:/g,":");g=g||{};s(r.urlParams,function(d,
c){f=g.hasOwnProperty(c)?g[c]:r.defaults[c];a.isDefined(f)&&null!==f?(p=encodeURIComponent(f).replace(/%40/gi,"@").replace(/%3A/gi,":").replace(/%24/g,"$").replace(/%2C/gi,",").replace(/%20/g,"%20").replace(/%26/gi,"&").replace(/%3D/gi,"=").replace(/%2B/gi,"+"),e=e.replace(RegExp(":"+c+"(\\W|$)","g"),function(a,c){return p+c})):e=e.replace(RegExp("(/?):"+c+"(\\W|$)","g"),function(a,c,d){return"/"==d.charAt(0)?d:c+d})});e=e.replace(/\/+$/,"")||"/";e=e.replace(/\/\.(?=\w+($|\?))/,".");c.url=e.replace(/\/\\\./,
"/.");s(g,function(a,e){r.urlParams[e]||(c.params=c.params||{},c.params[e]=a)})}};return t}])})(window,window.angular);
//# sourceMappingURL=angular-resource.min.js.map
/* Include this file in your html if you are using the CSP mode. */
@charset "UTF-8";
[ng\:cloak], [ng-cloak], [data-ng-cloak], [x-ng-cloak],
.ng-cloak, .x-ng-cloak,
.ng-hide {
display: none !important;
}
ng\:form {
display: block;
}
.ng-animate-block-transitions {
transition:0s all!important;
-webkit-transition:0s all!important;
}
/* show the element during a show/hide animation when the
* animation is ongoing, but the .ng-hide class is active */
.ng-hide-add-active, .ng-hide-remove {
display: block!important;
}
......@@ -87,8 +87,8 @@ function Final_UpdateTable( action ) {
var UpdateTable = false
if ( (action == "click" && !isCollapsed) || (action=="changerange" && isCollapsed) ) {
UpdateTable = true;
$("#corpusdisplayer").html("Close Folder")
} else $("#corpusdisplayer").html("Open Folder")
$("#corpusdisplayer").html("View by titles")
} else $("#corpusdisplayer").html("View by titles")
pr("update table??: "+UpdateTable)
......@@ -230,9 +230,9 @@ $("#move2trash")
console.log(ids2trash)
$.ajax({
url: "/tests/move2trash/",
data: "nodeids="+JSON.stringify(ids2trash),
type: 'POST',
url : window.location.origin + "/api/nodes?ids="+ids2trash,
//data: 'ids:'+JSON.stringify(ids2trash),
type: 'DELETE',
beforeSend: function(xhr) {
xhr.setRequestHeader("X-CSRFToken", getCookie("csrftoken"));
},
......
......@@ -113,7 +113,7 @@ function Final_UpdateTable( action ) {
if ( (action == "click" && !isCollapsed) || (action=="changerange" && isCollapsed) ) {
UpdateTable = true;
$("#corpusdisplayer").html("Close Folder")
} else $("#corpusdisplayer").html("Open Folder")
} else $("#corpusdisplayer").html("View by journals")
pr("update table??: "+UpdateTable)
......
......@@ -334,7 +334,7 @@ function Final_UpdateTable( action ) {
if ( (action == "click" && !isCollapsed) || (action=="changerange" && isCollapsed) ) {
UpdateTable = true;
$("#corpusdisplayer").html("Close Term List")
} else $("#corpusdisplayer").html("Show Term List")
} else $("#corpusdisplayer").html("View by terms")
pr("update table??: "+UpdateTable)
......
/srv/gargantext_lib/js/libs
\ No newline at end of file
/srv/gargantext_lib/js/settings_explorerjs.js
\ No newline at end of file
/srv/gargantext_lib/js/tinawebJS
\ No newline at end of file
......@@ -34,9 +34,11 @@
</center>
</div>
</div>
<div class="row">
<div id="monthly-volume-chart"></div>
</div>
<div id="content_loader">
<br>
<center>
......@@ -50,10 +52,9 @@
<div class="panel-heading">
<h4 class="panel-title">
<a data-toggle="collapse" data-target="#journal_table" href="#">
<a data-toggle="collapse" data-parent="#accordion" href="#collapseOne">
<!-- Final_UpdateTable redraws the dynatable if necessary -->
<p id="corpusdisplayer" onclick='Final_UpdateTable("click")' class="btn btn-primary btn-lg">
Open Folder
<p id="corpusdisplayer" onclick='Final_UpdateTable("click")' class="btn btn-primary btn-lg">Journals
</p>
</a>
</h4>
......
......@@ -53,7 +53,7 @@
<div class="panel-heading">
<h4 class="panel-title">
<a data-toggle="collapse" data-target="#terms_table" href="#">
<a data-toggle="collapse" data-parent="#accordion" href="#collapseOne">
<!-- Final_UpdateTable redraws the dynatable if necessary -->
<p id="corpusdisplayer" onclick='Final_UpdateTable("click")' class="btn btn-primary btn-lg">
Close term list
......@@ -78,11 +78,13 @@
</div> <!-- /div panel -->
</div> <!-- /row with the dynatable panels -->
</div> <!-- /jumbotron -->
</div>
<!-- /jumbotron
<button id="ImportList" onclick="GetUserPortfolio();" class="btn btn-warning">
Import a Corpus-List
</button>
-->
</div> <!-- /container -->
......